-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: setup.sh
More file actions
135 lines (114 loc) · 4.08 KB
/
setup.sh
File metadata and controls
135 lines (114 loc) · 4.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/bin/bash
# =============================================================================
# setup.sh - Native Setup Script for Llama.cpp on AWS EC2
# Deploys OpenAI GPT-OSS-20B via llama.cpp server
# =============================================================================
# Strict mode: abort on command failure (-e), on use of unset variables (-u),
# and when any stage of a pipeline fails (-o pipefail).
set -euo pipefail

# ANSI color codes used by the logging helpers below (NC = reset/no color).
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'

# Logging helpers: success, warning, error (stderr), info, section header.
print_status()  { echo -e "${GREEN}[✓]${NC} $1"; }
print_warning() { echo -e "${YELLOW}[!]${NC} $1"; }
print_error()   { echo -e "${RED}[✗]${NC} $1" >&2; }
print_info()    { echo -e "${BLUE}[i]${NC} $1"; }
print_header()  { echo -e "\n${BLUE}=== $1 ===${NC}\n"; }
# Privilege guard: the installer needs root (directly or via sudo).
# Returns 0 when we are already root or a sudo session can be established.
has_privileges() {
  [ "$EUID" -eq 0 ] || sudo -v &>/dev/null
}

if ! has_privileges; then
  print_error "This script requires sudo access"
  exit 1
fi
print_header "AWS EC2 Native Llama.cpp Deployment (GPT-OSS-20B)"

# -----------------------------------------------------------------------------
# Step 1: Install build dependencies and detect GPU/CUDA availability.
# A CPU-only build is allowed, but only after explicit user confirmation.
# -----------------------------------------------------------------------------
print_header "Step 1: Installing Dependencies"
sudo apt-get update
sudo apt-get install -y build-essential cmake git curl wget make g++

PROCEED_WITHOUT_CUDA=false
if command -v nvcc &> /dev/null; then
  # Toolkit already present: report its release string.
  print_status "CUDA Toolkit found $(nvcc --version | grep release | awk '{print $5,$6}')"
else
  print_warning "CUDA Toolkit (nvcc) not found."
  if command -v nvidia-smi &> /dev/null; then
    # Driver is installed but the toolkit is missing: install it.
    print_info "NVIDIA Driver detected. Installing CUDA Toolkit..."
    sudo apt-get install -y nvidia-cuda-toolkit
  else
    # No GPU at all: confirm the user really wants a CPU-only build.
    print_warning "No NVIDIA GPU detected (nvidia-smi failed)."
    print_warning "Proceeding will build for CPU only (slow!)."
    read -p "Continue without GPU support? (y/N): " -n 1 -r
    echo
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then exit 1; fi
    PROCEED_WITHOUT_CUDA=true
  fi
fi
# -----------------------------------------------------------------------------
# Step 2: Clone (or update) and build llama.cpp.
# Upstream removed the legacy Makefile build in favor of CMake, so
# `make GGML_CUDA=1` fails on current checkouts. We build with CMake and copy
# the server binary back to the repo root so the historical ./llama-server
# location (used below and possibly by the systemd unit) keeps working.
# -----------------------------------------------------------------------------
print_header "Step 2: Building llama.cpp"
INSTALL_DIR="/home/ubuntu/llama.cpp"
if [ -d "$INSTALL_DIR" ]; then
  print_info "llama.cpp directory exists. updating..."
  cd "$INSTALL_DIR"
  git pull
else
  git clone https://github.com/ggerganov/llama.cpp "$INSTALL_DIR"
  cd "$INSTALL_DIR"
fi

print_info "Compiling llama.cpp..."
if [ "$PROCEED_WITHOUT_CUDA" = true ]; then
  cmake -B build
else
  cmake -B build -DGGML_CUDA=ON
fi
cmake --build build --config Release -j "$(nproc)"

# Preserve the legacy binary location expected by the rest of this script.
if [ -f "build/bin/llama-server" ]; then
  cp -f build/bin/llama-server ./llama-server
fi

if [ -f "./llama-server" ]; then
  print_status "Build successful: $(./llama-server --version | head -n 1)"
else
  print_error "Build failed. llama-server binary not found."
  exit 1
fi
# -----------------------------------------------------------------------------
# Step 3: Verify the GGUF model file exists; create /models if missing and
# print download instructions when the model is absent.
# -----------------------------------------------------------------------------
print_header "Step 3: Model Verification"
MODEL_DIR="/models"
GPT_OSS_MODEL="$MODEL_DIR/gpt-oss-20b.gguf"

if [ ! -d "$MODEL_DIR" ]; then
  print_warning "$MODEL_DIR does not exist. Creating..."
  sudo mkdir -p "$MODEL_DIR"
  sudo chown -R ubuntu:ubuntu "$MODEL_DIR"
fi

if [ -f "$GPT_OSS_MODEL" ]; then
  print_status "Model found."
else
  # Not fatal: the service simply won't start until the model is in place.
  print_warning "Model missing: $GPT_OSS_MODEL"
  echo "Expected:"
  echo " - $GPT_OSS_MODEL"
  echo
  echo "Download with:"
  echo " wget -O $GPT_OSS_MODEL https://huggingface.co/ggml-org/gpt-oss-20b-GGUF/resolve/main/gpt-oss-20b-mxfp4.gguf"
  echo
  echo "Please ensure the model is downloaded or moved to $MODEL_DIR."
fi
# -----------------------------------------------------------------------------
# Step 4: Install the environment file and the systemd unit.
# -----------------------------------------------------------------------------
print_header "Step 4: Configure Services"

# Directory containing this script (assumed to be the repo root).
REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# The env file carries the API key, so lock permissions down to 0600.
ENV_FILE="/etc/default/llama-cpp"
sudo cp "$REPO_DIR/config/llama.env" "$ENV_FILE"
sudo chmod 600 "$ENV_FILE"
print_status "Config installed to /etc/default/llama-cpp"

# Register the unit with systemd and enable it at boot.
sudo cp "$REPO_DIR/llama-gpt-oss.service" /etc/systemd/system/
sudo systemctl daemon-reload
sudo systemctl enable llama-gpt-oss
print_status "Systemd service enabled."
# -----------------------------------------------------------------------------
# Step 5: Optionally start the service, then print connection details.
# -----------------------------------------------------------------------------
print_header "Step 5: Start Service"
read -p "Start service now? (Y/n): " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Nn]$ ]]; then
  sudo systemctl restart llama-gpt-oss
  print_info "Service started. Check status with: sudo systemctl status llama-gpt-oss"
else
  print_info "Skipping start. Run 'sudo systemctl start llama-gpt-oss' manually."
fi

print_header "Setup Complete!"
# Public-IP lookup is best-effort: bound it with a timeout and fall back to a
# placeholder so the summary never hangs or prints an empty host.
PUBLIC_IP="$(curl -s --max-time 5 ifconfig.me || echo "<EC2-PUBLIC-IP>")"
echo "GPT-OSS-20B API: http://${PUBLIC_IP}:8080/v1/chat/completions"
# Anchor to the exact key name and keep everything after the FIRST '=' so
# keys containing '=' survive (unanchored 'grep API_KEY | cut -f2' matched
# unrelated lines and truncated the value). The file is root-owned 0600
# (chmod'd during install), so reading it requires sudo.
echo "API Key: $(sudo grep -m1 '^API_KEY=' /etc/default/llama-cpp | cut -d= -f2-)"