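"""Minimal Flask service exposing two local causal-LM checkpoints
(Phi-3-mini-4k-instruct and granite-3b-code-instruct) behind a single
POST /query/<model> endpoint."""
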
import torch
from flask import Flask, request
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

app = Flask(__name__)

@app.route("/query/",methods=["POST"])
|
|
@app.route("/query/<string:req_model>",methods=["POST"])
|
|
def generate_response(req_model="phi"):
|
|
if req_model not in ['phi','granite']: return "Only models phi and granite are supported.", 400
|
|
|
|
    try:
        query = request.form['query']

        # Select the preloaded model/tokenizer pair for the requested name.
        if req_model == 'granite':
            model = model_granite
            tokenizer = tokenizer_granite
        else:
            model = model_phi
            tokenizer = tokenizer_phi

        messages = [
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": query},
        ]

        # A text-generation pipeline is rebuilt per request from the cached
        # model and tokenizer objects.
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
        )

        # Greedy decoding: temperature is ignored when do_sample is False.
        generation_args = {
            "max_new_tokens": 2000,
            "return_full_text": False,
            "temperature": 0.0,
            "do_sample": False,
        }

        output = pipe(messages, **generation_args)
        return output[0]['generated_text']

    except Exception as e:
        # Flask cannot serialise an exception object directly, so return its
        # message as the response body.
        return str(e), 500


if __name__ == '__main__':
    torch.random.manual_seed(0)

    # Load both checkpoints onto the GPU once at startup; the request handler
    # reuses these module-level objects across requests.
    model_phi = AutoModelForCausalLM.from_pretrained(
        "/home/fedora/microsoft/Phi-3-mini-4k-instruct",
        device_map="cuda",
        torch_dtype="auto",
        trust_remote_code=True,
    )
    model_granite = AutoModelForCausalLM.from_pretrained(
        "/home/fedora/granite-3b-code-instruct",
        device_map="cuda",
        torch_dtype="auto",
        trust_remote_code=True,
    )

    tokenizer_phi = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
    tokenizer_granite = AutoTokenizer.from_pretrained("/home/fedora/granite-3b-code-instruct")

    app.run(host='0.0.0.0')
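
# Example request (a sketch: assumes the default Flask port 5000 and that the
# server is reachable on localhost; the model segment is optional and
# defaults to "phi"):
#
#   curl -X POST -d "query=Write a bubble sort in Python" \
#        http://localhost:5000/query/granite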