llm_worker.py
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from multiprocessing import Queue
import torch
import time

# VRAM budget (optional, used only for logging)
try:
    VRAM_TOTAL = torch.cuda.get_device_properties(0).total_memory
except Exception:
    VRAM_TOTAL = 0  # No GPU available

VRAM_LIMITS = {
    "llm": float(VRAM_TOTAL * 0.45),
    "stt": float(VRAM_TOTAL * 0.30),
    "tts": float(VRAM_TOTAL * 0.25)
}

def llm_worker(input_queue: Queue, output_queue: Queue):
    print("🦾 Starting LLM Worker...")
    model_name = "google/gemma-3-1b-it"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # The model starts on the CPU
    model = AutoModelForCausalLM.from_pretrained(model_name).to("cpu")
    print("🧠 Model loaded on CPU.")

    while True:
        try:
            item = input_queue.get()
            if item is None:
                break  # Shutdown signal

            print("🔄 Moving model to GPU...")
            model.to("cuda")

            # Build a temporary text-generation pipeline around the GPU-resident model
            llm_pipeline = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=128,
                temperature=0.7,
                top_k=20,
                device=0  # Use the GPU
            )
            response = llm_pipeline(item["text"])
            generated_text = response[0]["generated_text"].replace(item["text"], "").strip()
            output_queue.put({"text": generated_text})

            print("💤 Returning model to CPU...")
            model.to("cpu")
            torch.cuda.empty_cache()
        except Exception as e:
            print(f"🚨 Error during processing: {str(e)}")
            torch.cuda.empty_cache()
            time.sleep(1)
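
A minimal sketch of how this worker might be driven from a parent process, assuming the file above is importable as llm_worker; the driver filename, the __main__ guard, and the example prompt are illustrative and not part of the original repository.

# launch_llm_worker.py -- illustrative driver, not part of the original file
from multiprocessing import Process, Queue

from llm_worker import llm_worker  # assumes llm_worker.py is on the import path

if __name__ == "__main__":
    input_queue = Queue()
    output_queue = Queue()

    # Run the worker in its own process so model loading does not block the parent
    worker = Process(target=llm_worker, args=(input_queue, output_queue), daemon=True)
    worker.start()

    # Send one request and wait for the generated reply
    input_queue.put({"text": "Explain what a multiprocessing queue is."})
    print(output_queue.get()["text"])

    # Signal shutdown: the worker breaks out of its loop when it receives None
    input_queue.put(None)
    worker.join()

Keeping the model on the CPU between requests and moving it to the GPU only while generating is what lets the LLM share VRAM with the STT and TTS workers implied by VRAM_LIMITS, at the cost of a host-to-device transfer per request.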