import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from flask import Flask, request

app = Flask(__name__)


# POST /query/ defaults to the phi model; POST /query/<req_model> selects "phi" or "granite".
@app.route("/query/", methods=["POST"])
@app.route("/query/<req_model>", methods=["POST"])
def generate_response(req_model="phi"):
    if req_model not in ("phi", "granite"):
        return "Only models phi and granite are supported.", 400
    try:
        query = request.form["query"]

        # Pick the preloaded model/tokenizer pair for this request.
        if req_model == "granite":
            model = model_granite
            tokenizer = tokenizer_granite
        else:
            model = model_phi
            tokenizer = tokenizer_phi

        messages = [
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": query},
        ]

        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
        )

        # Greedy decoding: with do_sample=False the temperature value has no effect.
        generation_args = {
            "max_new_tokens": 2000,
            "return_full_text": False,
            "temperature": 0.0,
            "do_sample": False,
        }

        output = pipe(messages, **generation_args)
        return output[0]["generated_text"]
    except Exception as e:
        # Return the error message as text; Flask cannot serialize the exception object itself.
        return str(e), 500


if __name__ == "__main__":
    torch.random.manual_seed(0)

    # Load both models onto the GPU once at startup so every request reuses them.
    model_phi = AutoModelForCausalLM.from_pretrained(
        "/home/fedora/microsoft/Phi-3-mini-4k-instruct",
        device_map="cuda",
        torch_dtype="auto",
        trust_remote_code=True,
    )
    model_granite = AutoModelForCausalLM.from_pretrained(
        "/home/fedora/granite-3b-code-instruct",
        device_map="cuda",
        torch_dtype="auto",
        trust_remote_code=True,
    )
    tokenizer_phi = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
    tokenizer_granite = AutoTokenizer.from_pretrained("/home/fedora/granite-3b-code-instruct")

    app.run(host="0.0.0.0")
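
# Example client call (a minimal sketch, not part of the server: it assumes the app above
# is running on Flask's default port 5000 and that the "requests" package is installed):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:5000/query/granite",
#       data={"query": "Write a Python function that reverses a string."},
#   )
#   print(resp.status_code)
#   print(resp.text)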