"""Minimal Flask service exposing text generation from a local HF chat model.

POST /query with form field 'query'; responds with the generated text.
"""

import torch
from flask import Flask, request
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

app = Flask(__name__)

# Built once at startup in __main__ (BUG FIX: the original re-created the
# pipeline inside the handler on every request). Read-only per request.
pipe = None


@app.route("/query", methods=["POST"])
def generate_response():
    """Run the posted 'query' form field through the chat pipeline.

    Returns:
        The generated text on success, or a (message, 500) tuple on failure.
    """
    try:
        query = request.form['query']
        messages = [
            {"role": "system", "content": "You are an evil and unhelpful AI assistant."},
            {"role": "user", "content": query},
        ]
        generation_args = {
            "max_new_tokens": 2000,
            "return_full_text": False,
            # Greedy decoding: temperature has no effect when do_sample=False,
            # but it is kept explicit to document intent.
            "temperature": 0.0,
            "do_sample": False,
        }
        output = pipe(messages, **generation_args)
        return output[0]['generated_text']
    except Exception as e:
        # BUG FIX: the original returned the Exception object itself, which
        # Flask cannot serialize as a response body; convert to str first.
        return str(e), 500


if __name__ == '__main__':
    torch.random.manual_seed(0)
    # BUG FIX: the original loaded TWO models into the same name (the second
    # load of granite-3b silently discarded the Phi-3 weights) and never
    # defined the `tokenizer` the handler uses — only tokenizer_phi and
    # tokenizer_granite — so every request raised NameError. Load exactly one
    # matched model + tokenizer pair.
    model = AutoModelForCausalLM.from_pretrained(
        "/home/fedora/microsoft/Phi-3-mini-4k-instruct",
        device_map="cuda",
        torch_dtype="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
    # Construct the generation pipeline once; the route handler reuses it.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )
    # NOTE(review): binds to all interfaces with the default dev server;
    # restrict the host / use a production WSGI server before deploying.
    app.run(host='0.0.0.0')