# Data loading (GLM streaming proxy)
import time
import requests
import modal
import json
from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
# --- Modal / FastAPI wiring -------------------------------------------------

# Container image with the proxy's two runtime dependencies.
image = modal.Image.debian_slim().pip_install("requests", "fastapi[standard]")
app = modal.App("glm5-proxy", image=image)

# Upstream OpenAI-compatible chat-completions endpoint (NVIDIA integrate API).
GLM_URL = "https://integrate.api.nvidia.com/v1/chat/completions"
# SECURITY NOTE(review): hard-coded credential; "КЛЮЧ" is Russian for "KEY",
# so this looks like a placeholder — move the real key to a Modal Secret or
# environment variable rather than committing it in source.
GLM_API_KEY = "КЛЮЧ"

# System prompt injected into every request (see inject_system_prompt) to
# push the model toward paragraph-formatted output.
FORMATTING_SYSTEM_PROMPT = """Format your responses with proper paragraphs.
Use line breaks between paragraphs.
Never write a response as a single block of text.
Separate different thoughts, actions, and dialogue into distinct paragraphs."""

web_app = FastAPI()
# Fully open CORS: any origin, method, and header may call this proxy.
web_app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
def inject_system_prompt(messages: list) -> list:
    """Return *messages* with FORMATTING_SYSTEM_PROMPT merged in.

    If the first message is a system message, the formatting prompt is
    appended to its content (separated by a blank line); otherwise a new
    system message is prepended.

    Unlike the previous version, this does NOT mutate the caller's list or
    its dicts — it works on a shallow copy, so the same input can safely be
    reused or passed through twice without double-injecting the prompt.

    Args:
        messages: OpenAI-style chat messages (dicts with "role"/"content").

    Returns:
        A new list with the formatting prompt injected; the original list
        object is returned unchanged only when it is empty.
    """
    if not messages:
        return messages
    # Copy each message dict so the caller's objects are never modified.
    result = [dict(m) for m in messages]
    first = result[0]
    if first.get("role") == "system":
        first["content"] = first["content"] + "\n\n" + FORMATTING_SYSTEM_PROMPT
    else:
        result.insert(0, {"role": "system", "content": FORMATTING_SYSTEM_PROMPT})
    return result
def fix_chunk_formatting(chunk_bytes: bytes) -> bytes:
    """Post-process one SSE chunk from the upstream model stream.

    For every ``data: {...}`` line (except the ``data: [DONE]`` sentinel),
    the JSON payload is decoded and each choice's ``delta.content`` has
    literal two-character backslash-n sequences replaced with real newline
    characters, then the line is re-serialized with ``ensure_ascii=False``.
    Non-payload lines, the DONE sentinel, and lines that fail to parse as
    JSON pass through untouched. This is a best-effort filter: any
    unexpected failure returns the chunk exactly as received.

    Args:
        chunk_bytes: raw bytes of one streamed chunk (may contain several
            SSE lines, or a fragment of one).

    Returns:
        The rewritten chunk, or the original bytes on any error.
    """
    try:
        out = []
        for raw_line in chunk_bytes.decode("utf-8").split("\n"):
            # Pass through anything that is not a JSON data payload.
            if not raw_line.startswith("data: ") or raw_line == "data: [DONE]":
                out.append(raw_line)
                continue
            try:
                payload = json.loads(raw_line[len("data: "):])
            except json.JSONDecodeError:
                # Likely a fragment split across chunks — forward unchanged.
                out.append(raw_line)
                continue
            for choice in payload.get("choices", []):
                delta = choice.get("delta", {})
                text = delta.get("content", "")
                if text:
                    delta["content"] = text.replace("\\n", "\n")
            out.append("data: " + json.dumps(payload, ensure_ascii=False))
        return "\n".join(out).encode("utf-8")
    except Exception:
        # Best-effort: never break the stream because of a formatting pass.
        return chunk_bytes
@web_app.post("/")
async def proxy(request: Request):
    """Proxy an OpenAI-style chat-completion request to the GLM upstream.

    Forces streaming, pins model/temperature/max_tokens (overriding whatever
    the client sent), injects the formatting system prompt, and relays the
    upstream SSE stream back after passing each chunk through
    fix_chunk_formatting(). Upstream non-200 responses and network errors
    are forwarded as plain responses (upstream status / 502 / 500).
    """
    # Log line is Russian for "NEW REQUEST".
    print(f"\n[>>>] {time.strftime('%H:%M:%S')} - НОВЫЙ ЗАПРОС")
    try:
        body = await request.json()
        # Client-supplied values for these fields are deliberately overridden.
        body['stream'] = True
        body['model'] = "z-ai/glm-5.1"
        body['temperature'] = 1.0
        body['max_tokens'] = 16384
        # NOTE(review): presumably a backend-specific switch enabling the
        # model's "thinking" mode — confirm against the upstream API docs.
        body.setdefault("chat_template_kwargs", {})["enable_thinking"] = True
        if 'messages' in body:
            body['messages'] = inject_system_prompt(body['messages'])
        headers = {
            "Authorization": f"Bearer {GLM_API_KEY}",
            "Content-Type": "application/json",
            "Connection": "keep-alive"
        }
        try:
            # NOTE(review): requests.post is a blocking call inside an async
            # endpoint — it stalls the event loop while connected upstream.
            # Consider httpx.AsyncClient, or a sync `def` handler so FastAPI
            # runs it in a threadpool.
            r = requests.post(GLM_URL, headers=headers, json=body, stream=True, timeout=600)
            # "RESPONSE FROM GLM"
            print(f"[<<<] ОТВЕТ ОТ GLM: {r.status_code}")
            if r.status_code != 200:
                # Forward upstream errors verbatim with their status code.
                return Response(content=r.text, status_code=r.status_code)

            def generate():
                # NOTE(review): iter_content(1024) can split an SSE event
                # across chunks; fix_chunk_formatting then forwards the
                # unparsable fragment unchanged rather than repairing it.
                try:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            yield fix_chunk_formatting(chunk)
                except Exception as e:
                    # "Streaming interrupted"
                    print(f"[!] Стриминг прерван: {e}")

            return StreamingResponse(
                generate(),
                media_type="text/event-stream",
                headers={
                    "Cache-Control": "no-cache",
                    "Connection": "keep-alive",
                    # Disable proxy-side (e.g. nginx) response buffering so
                    # chunks reach the client immediately.
                    "X-Accel-Buffering": "no"
                }
            )
        except requests.exceptions.RequestException as e:
            # "NETWORK ERROR"
            print(f"!!! ОШИБКА СЕТИ: {e}")
            return Response(content=f"Network error: {e}", status_code=502)
    except Exception as e:
        # "CRITICAL ERROR" — catch-all boundary for malformed bodies etc.
        print(f"!!! КРИТИЧЕСКАЯ ОШИБКА: {str(e)}")
        return Response(content=f"Proxy error: {str(e)}", status_code=500)
@app.function(timeout=600, min_containers=1)
@modal.asgi_app()
def fastapi_app():
    """Modal entry point: serve the FastAPI app as an ASGI web endpoint.

    min_containers=1 presumably keeps one container warm to avoid cold
    starts — confirm against the Modal autoscaling documentation.
    """
    return web_app