phi3_model_testing/testing.py

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from flask import Flask, request
app = Flask(__name__)
@app.route("/query/",methods=["POST"])
@app.route("/query/<string:req_model>",methods=["POST"])
def generate_response(req_model="phi"):
if req_model not in ['phi','granite']: return "Only models phi and granite are supported.", 400
try:
query = request.form['query']
if req_model == 'granite':
model = model_granite
tokenizer = tokenizer_granite
else:
model = model_phi
tokenizer = tokenizer_phi
messages = [
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user","content": query}
]
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
)
generation_args = {
"max_new_tokens": 2000,
"return_full_text": False,
"temperature": 0.0,
"do_sample": False,
}
output = pipe(messages, **generation_args)
return output[0]['generated_text']
except Exception as e:
return e, 500
if __name__ == '__main__':
    torch.random.manual_seed(0)
    # Load both models onto the GPU once at startup; the route handler
    # selects between them per request.
    model_phi = AutoModelForCausalLM.from_pretrained(
        "/home/fedora/microsoft/Phi-3-mini-4k-instruct",
        device_map="cuda",
        torch_dtype="auto",
        trust_remote_code=True,
    )
    model_granite = AutoModelForCausalLM.from_pretrained(
        "/home/fedora/granite-3b-code-instruct",
        device_map="cuda",
        torch_dtype="auto",
        trust_remote_code=True,
    )
    tokenizer_phi = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
    tokenizer_granite = AutoTokenizer.from_pretrained("/home/fedora/granite-3b-code-instruct")
    app.run(host='0.0.0.0')
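
# --- Example client usage (illustrative sketch, not part of the original script) ---
# Assumes the server is running locally on Flask's default port 5000. The route
# expects a form field named "query"; the optional path segment selects "phi" or
# "granite".
#
#   import requests
#   resp = requests.post(
#       "http://localhost:5000/query/granite",
#       data={"query": "Write a Python function that reverses a string."},
#   )
#   print(resp.status_code)
#   print(resp.text)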