diff --git a/app/src/app/tools/[toolId]/page.tsx b/app/src/app/tools/[toolId]/page.tsx
index 154a99e..3c0901c 100644
--- a/app/src/app/tools/[toolId]/page.tsx
+++ b/app/src/app/tools/[toolId]/page.tsx
@@ -1,9 +1,9 @@
 "use client";
 
 import { Button, Label, Textarea } from "@ansospace/ui";
-import { ArrowUpRight, Crown, Lock, Play, X } from "lucide-react";
+import { ArrowUpRight, Crown, Lock, Play, Plus, X } from "lucide-react";
 import { notFound, useParams } from "next/navigation";
-import { useCallback, useEffect, useState } from "react";
+import { useCallback, useEffect, useRef, useState } from "react";
 import { CodeEditor } from "@/components/code-editor";
 import { ResultViewer } from "@/components/result-viewer";
 import { ToolLayout } from "@/components/tool-layout";
@@ -44,6 +44,9 @@ function ToolPageContent({ toolId }: { toolId: string }) {
 		return initial;
 	});
 
+	// Length selector modal state
+	const [showLengthModal, setShowLengthModal] = useState(false);
+
 	// Tier2 tools MUST bypass Next.js proxy buffering to prevent silent timeouts on long executions
 	const runnerUrl = process.env.NEXT_PUBLIC_TOOL_RUNNER_URL || "http://localhost:9080";
 	const apiBase = tool.tier === "tier2" ? `${runnerUrl}/api/tools` : "/api/tools";
@@ -83,23 +86,25 @@ function ToolPageContent({ toolId }: { toolId: string }) {
 		}
 	}, [mounted, toolUsage.limitReached]);
 
+	// Check if all required fields are filled
+	const isReady = tool.requiredFields.every((field) => fields[field]?.trim());
+
 	const handleExecute = () => {
-		// Check per-tool usage limit
 		if (!canExecute(tool.id)) {
 			setShowUpgradeDialog(true);
 			return;
 		}
-		// Check required fields
 		for (const field of tool.requiredFields) {
 			if (!fields[field]?.trim()) return;
 		}
-		execute({ ...fields, model });
-		// Track usage for THIS tool
-		trackExecution(tool.id);
-	};
 
-	// Check if all required fields are filled
-	const isReady = tool.requiredFields.every((field) => fields[field]?.trim());
+		if (tool.requireLengthSelection) {
+			setShowLengthModal(true);
+		} else {
+			execute({ ...fields, model });
+			trackExecution(tool.id);
+		}
+	};
 
 	return (
 		<>
@@ -117,6 +122,7 @@ function ToolPageContent({ toolId }: { toolId: string }) {
 							config={input}
 							value={fields[input.key] || ""}
 							onChange={(value) => setField(input.key, value)}
+							onFieldChange={(key, value) => setField(key, value)}
 						/>
 					))}
 
@@ -202,11 +208,66 @@ function ToolPageContent({ toolId }: { toolId: string }) {
 					{/* Results */}
 					<div className="space-y-2">
 						<Label>Result</Label>
-						<ResultViewer result={result} isLoading={isLoading} error={error} streaming />
+						{tool.ResultComponent && result ? (
+							<tool.ResultComponent result={result} isLoading={isLoading} error={error} />
+						) : (
+							<ResultViewer result={result} isLoading={isLoading} error={error} streaming />
+						)}
 					</div>
 				</div>
 			</ToolLayout>
 
+			{/* Length selector modal (for tools that require it) */}
+			{tool.requireLengthSelection && showLengthModal && (
+				<div className="fixed inset-0 z-50 flex items-center justify-center bg-black/60 backdrop-blur-sm animate-in fade-in duration-200">
+					<div className="relative mx-4 w-full max-w-sm rounded-2xl border border-border/50 bg-card p-6 shadow-2xl animate-in zoom-in-95 duration-200">
+						<button
+							type="button"
+							onClick={() => setShowLengthModal(false)}
+							className="absolute right-4 top-4 rounded-full p-1 text-muted-foreground hover:bg-muted hover:text-foreground transition-colors"
+						>
+							<X className="h-4 w-4" />
+						</button>
+
+						<div className="mx-auto mb-4 flex h-12 w-12 items-center justify-center rounded-full bg-primary/10">
+							<Play className="h-6 w-6 text-primary" />
+						</div>
+
+						<h3 className="text-center text-lg font-semibold">Caption Length</h3>
+						<p className="mt-1 text-center text-sm text-muted-foreground">
+							How long do you want your captions to be?
+						</p>
+
+						<div className="mt-5 flex gap-3">
+							<Button
+								variant="outline"
+								className="flex-1 h-auto flex-col gap-1 py-4"
+								onClick={() => {
+									setShowLengthModal(false);
+									execute({ ...fields, model, length_type: "short" });
+									trackExecution(tool.id);
+								}}
+							>
+								<span className="text-base font-semibold">Short</span>
+								<span className="text-xs text-muted-foreground">Quick & punchy</span>
+							</Button>
+							<Button
+								variant="outline"
+								className="flex-1 h-auto flex-col gap-1 py-4"
+								onClick={() => {
+									setShowLengthModal(false);
+									execute({ ...fields, model, length_type: "long" });
+									trackExecution(tool.id);
+								}}
+							>
+								<span className="text-base font-semibold">Long</span>
+								<span className="text-xs text-muted-foreground">Detailed & descriptive</span>
+							</Button>
+						</div>
+					</div>
+				</div>
+			)}
+
 			{/* ─── Upgrade Dialog Popup ──────────────────────────── */}
 			{showUpgradeDialog && (
 				<div className="fixed inset-0 z-50 flex items-center justify-center bg-black/60 backdrop-blur-sm animate-in fade-in duration-200">
@@ -297,11 +358,18 @@ function InputField({
 	config,
 	value,
 	onChange,
+	onFieldChange,
 }: {
 	config: InputFieldConfig;
 	value: string;
 	onChange: (value: string) => void;
+	onFieldChange?: (key: string, value: string) => void;
 }) {
+	const textareaFileRef = useRef<HTMLInputElement>(null);
+	const [textareaShowPreview, setTextareaShowPreview] = useState(false);
+	const [textareaSpinning, setTextareaSpinning] = useState(false);
+	const [textareaAttachedImage, setTextareaAttachedImage] = useState("");
+
 	switch (config.type) {
 		case "code":
 			return (
@@ -316,19 +384,103 @@ function InputField({
 				</div>
 			);
 
-		case "textarea":
+		case "textarea": {
+			const hasImage = config.attachable && !!textareaAttachedImage;
 			return (
 				<div className="space-y-2">
 					<Label>{config.label}</Label>
-					<Textarea
-						value={value}
-						onChange={(e) => onChange(e.target.value)}
-						placeholder={config.placeholder}
-						rows={config.rows || 4}
-						className="resize-none"
-					/>
+					<div
+						className={`relative rounded-md border border-input bg-background transition-all duration-200 ${
+							hasImage ? "ring-1 ring-primary/20" : ""
+						} ${hasImage ? "focus-within:ring-2 focus-within:ring-primary/30" : ""}`}
+					>
+						{hasImage && (
+							<div className="absolute left-2 top-2 z-10">
+								<button
+									type="button"
+									onClick={() => setTextareaShowPreview(true)}
+									className="h-12 w-12 overflow-hidden rounded-md border border-input shadow-xs hover:shadow-md transition-shadow"
+								>
+									<img
+										src={textareaAttachedImage}
+										alt="Attached"
+										className="h-full w-full object-cover"
+									/>
+								</button>
+								<button
+									type="button"
+									onClick={() => {
+										setTextareaAttachedImage("");
+										onFieldChange?.("image", "");
+									}}
+									className="absolute -top-2 -right-2 flex h-5 w-5 items-center justify-center rounded-full bg-background border border-input shadow-xs hover:bg-destructive/10 hover:text-destructive hover:border-destructive/30 transition-colors"
+								>
+									<X className="h-3 w-3" />
+								</button>
+							</div>
+						)}
+						<Textarea
+							value={value}
+							onChange={(e) => onChange(e.target.value)}
+							placeholder={config.placeholder}
+							rows={config.rows || 4}
+							className={`resize-none border-0 bg-transparent focus-visible:ring-0 focus-visible:ring-offset-0 ${
+								hasImage ? "pl-20" : "pl-14"
+							} min-h-[120px]`}
+						/>
+						{config.attachable && (
+							<>
+								<button
+									type="button"
+									onClick={() => {
+										setTextareaSpinning(true);
+										textareaFileRef.current?.click();
+										setTimeout(() => setTextareaSpinning(false), 400);
+									}}
+									className="absolute bottom-2 left-2 flex h-8 w-8 items-center justify-center rounded-full text-muted-foreground hover:text-primary transition-colors"
+								>
+									<Plus
+										className={`h-4 w-4 transition-transform duration-300 ${
+											textareaSpinning ? "rotate-180 scale-110" : ""
+										}`}
+									/>
+								</button>
+								<input
+									ref={textareaFileRef}
+									type="file"
+									accept={config.attachable.accept}
+									className="hidden"
+									onChange={(e) => {
+										const file = e.target.files?.[0];
+										if (!file) return;
+										const reader = new FileReader();
+										reader.onloadend = () => {
+											const dataUrl = reader.result as string;
+											setTextareaAttachedImage(dataUrl);
+											onFieldChange?.("image", dataUrl);
+										};
+										reader.readAsDataURL(file);
+									}}
+								/>
+							</>
+						)}
+					</div>
+					{hasImage && textareaShowPreview && (
+						<button
+							type="button"
+							className="fixed inset-0 z-50 flex cursor-pointer items-center justify-center bg-black/70 backdrop-blur-sm animate-in fade-in duration-200"
+							onClick={() => setTextareaShowPreview(false)}
+						>
+							<img
+								src={textareaAttachedImage}
+								alt="Preview"
+								className="max-h-[85vh] max-w-[90vw] rounded-lg object-contain shadow-2xl animate-in zoom-in-95 duration-200"
+							/>
+						</button>
+					)}
 				</div>
 			);
+		}
 
 		case "select":
 			return (
@@ -479,7 +631,7 @@ function InputField({
 											)
 										);
 										const valid = Array.from(files).filter((f) => {
-											const ext = "." + f.name.split(".").pop()?.toLowerCase();
+											const ext = `.${f.name.split(".").pop()?.toLowerCase()}`;
 											const p = f.webkitRelativePath || f.name;
 											if (
 												p.includes("__pycache__") ||
diff --git a/app/src/components/tools/caption-result-display.tsx b/app/src/components/tools/caption-result-display.tsx
new file mode 100644
index 0000000..79b163a
--- /dev/null
+++ b/app/src/components/tools/caption-result-display.tsx
@@ -0,0 +1,206 @@
+"use client";
+
+import { Check, Copy } from "lucide-react";
+import { useCallback, useState } from "react";
+
+/** One generated caption, as emitted by the caption-generator tool. */
+type CaptionVariation = {
+	text: string;
+	chars: number;
+	limit: number;
+	title?: string | null;
+};
+
+/** Parsed tool result; platform and length may be top-level or under `metadata`. */
+type CaptionPayload = {
+	variations: CaptionVariation[];
+	title?: string | null;
+	platform?: string;
+	variation_type?: string;
+	metadata?: { platform_name?: string; platform?: string; length_type?: string };
+};
+
+/**
+ * Button that copies `text` to the clipboard and shows a "Copied" state for
+ * two seconds after a successful write.
+ */
+function VariationCopyButton({ text }: { text: string }) {
+	const [copied, setCopied] = useState(false);
+	const handleCopy = useCallback(async () => {
+		try {
+			await navigator.clipboard.writeText(text);
+		} catch {
+			// Clipboard access can be denied or unavailable (e.g. insecure context).
+			// Keep the "Copy" label rather than leaking an unhandled rejection.
+			return;
+		}
+		setCopied(true);
+		setTimeout(() => setCopied(false), 2000);
+	}, [text]);
+
+	return (
+		<button
+			type="button"
+			onClick={handleCopy}
+			className="inline-flex items-center gap-1.5 rounded-md px-2.5 py-1 text-xs font-medium transition-colors hover:bg-primary/10 hover:text-primary data-[copied=true]:text-green-500"
+			data-copied={copied}
+		>
+			{copied ? <Check className="h-3 w-3" /> : <Copy className="h-3 w-3" />}
+			{copied ? "Copied" : "Copy"}
+		</button>
+	);
+}
+
+/**
+ * Tabbed viewer for caption variations: one tab per variation, an optional
+ * title card, the caption text with copy buttons, and a character-usage bar.
+ * Renders nothing when there are no variations.
+ */
+function CaptionVariationDisplay({
+	variations: rawVariations,
+	title,
+	platformName,
+	lengthType,
+}: {
+	variations?: CaptionVariation[];
+	title?: string | null;
+	platformName?: string;
+	lengthType?: string;
+}) {
+	const [activeIdx, setActiveIdx] = useState(0);
+	if (!rawVariations || rawVariations.length === 0) return null;
+
+	// The selected tab survives across results, so clamp it in case the new
+	// result has fewer variations than the previous one.
+	const currentIdx = Math.min(activeIdx, rawVariations.length - 1);
+	const active = rawVariations[currentIdx];
+	const varTitle = active.title || title;
+	const copyText = varTitle ? `Title: ${varTitle}\n\nCaption: ${active.text}` : active.text;
+	// A non-positive limit would turn the ratio into NaN/Infinity and break the bar width.
+	const charRatio = active.limit > 0 ? active.chars / active.limit : 0;
+	const barWidth = Math.min(charRatio * 100, 100);
+	const barColor =
+		charRatio > 1.0 ? "bg-red-500" : charRatio > 0.8 ? "bg-amber-500" : "bg-green-500";
+	// Tabs are positional ("Variation 1..n"). Caption text is not a safe React key:
+	// two variations can be identical (or both empty), which would duplicate keys.
+	const tabs = rawVariations.map((_, position) => ({
+		id: `variation-${position + 1}`,
+		position,
+	}));
+
+	return (
+		<div className="space-y-4">
+			{platformName && (
+				<div className="flex items-center justify-between">
+					<h3 className="text-base font-semibold text-foreground">{platformName}</h3>
+					{/* Only label the length when it is known; an absent value is not "Long". */}
+					{lengthType && (
+						<span className="rounded-full bg-muted px-2.5 py-0.5 text-xs font-medium text-muted-foreground">
+							{lengthType === "short" ? "Short" : "Long"}
+						</span>
+					)}
+				</div>
+			)}
+
+			<div className="flex gap-1.5">
+				{tabs.map((tab) => (
+					<button
+						key={tab.id}
+						type="button"
+						onClick={() => setActiveIdx(tab.position)}
+						className={`flex-1 rounded-md px-3 py-1.5 text-xs font-medium transition-colors ${
+							currentIdx === tab.position
+								? "bg-primary text-primary-foreground"
+								: "bg-muted text-muted-foreground hover:bg-muted/80"
+						}`}
+					>
+						Variation {tab.position + 1}
+					</button>
+				))}
+			</div>
+
+			<div className="space-y-2">
+				{varTitle && (
+					<div className="rounded-lg border border-primary/20 bg-primary/[0.02] p-3">
+						<div className="flex items-center justify-between">
+							<div>
+								<p className="text-xs text-muted-foreground mb-0.5">Title {currentIdx + 1}</p>
+								<p className="text-sm font-medium text-foreground">{varTitle}</p>
+							</div>
+							<VariationCopyButton text={varTitle} />
+						</div>
+					</div>
+				)}
+
+				<div className="rounded-lg border border-border bg-card p-4">
+					<div className="flex items-start justify-between gap-4">
+						<p className="text-sm leading-relaxed text-foreground/90 whitespace-pre-wrap flex-1 min-w-0">
+							{active.text}
+						</p>
+						<VariationCopyButton text={copyText} />
+					</div>
+					<div className="mt-3 flex items-center gap-2">
+						<div className="h-1.5 flex-1 overflow-hidden rounded-full bg-muted">
+							<div
+								className={`${barColor} h-full rounded-full transition-all`}
+								style={{ width: `${barWidth}%` }}
+							/>
+						</div>
+						<span className="text-xs tabular-nums text-muted-foreground shrink-0">
+							{active.chars} / {active.limit}
+						</span>
+					</div>
+				</div>
+			</div>
+		</div>
+	);
+}
+
+/** Type guard for a single variation entry in the parsed JSON. */
+function isCaptionVariation(value: unknown): value is CaptionVariation {
+	if (typeof value !== "object" || value === null) return false;
+	const entry = value as Record<string, unknown>;
+	return (
+		typeof entry.text === "string" &&
+		typeof entry.chars === "number" &&
+		typeof entry.limit === "number"
+	);
+}
+
+/**
+ * Parses the raw tool result. Returns null when it is not JSON or does not
+ * carry at least one well-formed variation, so the caller can show it verbatim.
+ */
+function parseCaptionPayload(result: string): CaptionPayload | null {
+	let data: unknown;
+	try {
+		data = JSON.parse(result);
+	} catch {
+		return null;
+	}
+	if (typeof data !== "object" || data === null) return null;
+	const { variations } = data as { variations?: unknown };
+	if (!Array.isArray(variations) || variations.length === 0) return null;
+	if (!variations.every(isCaptionVariation)) return null;
+	return data as CaptionPayload;
+}
+
+/**
+ * Result renderer for the caption generator. Shows the variation viewer for a
+ * valid JSON payload and falls back to the raw text otherwise.
+ */
+export function CaptionResultDisplay({ result }: { result: string }) {
+	const parsed = parseCaptionPayload(result);
+	if (!parsed) {
+		return <pre className="whitespace-pre-wrap text-sm">{result}</pre>;
+	}
+
+	return (
+		<CaptionVariationDisplay
+			variations={parsed.variations}
+			title={parsed.title}
+			platformName={parsed.metadata?.platform_name || parsed.metadata?.platform || parsed.platform}
+			lengthType={parsed.metadata?.length_type ?? parsed.variation_type}
+		/>
+	);
+}
diff --git a/app/src/lib/tools/caption-generator.ts b/app/src/lib/tools/caption-generator.ts
index a419e17..4c33a4c 100644
--- a/app/src/lib/tools/caption-generator.ts
+++ b/app/src/lib/tools/caption-generator.ts
@@ -1,4 +1,5 @@
-﻿import type { ToolDefinition } from "@/types";
+﻿import { CaptionResultDisplay } from "@/components/tools/caption-result-display";
+import type { ToolDefinition } from "@/types";
 
 export const captionGenerator: ToolDefinition = {
 	id: "caption-generator",
@@ -8,58 +9,37 @@ export const captionGenerator: ToolDefinition = {
 	category: "content",
 	icon: "PenTool",
 	status: "active",
-
-	requiredFields: ["contentDescription"],
-	defaultModel: "llama-3.3-70b",
-
-	buildSystemPrompt: ({ platform }) =>
-		`You are a social media content strategist. Generate engaging, platform-optimized captions. Rules:
-
-1. **Match the platform tone** - ${platform || "All platforms"} style and conventions
-2. **Hook first** - Start with an attention-grabbing line
-3. **Include CTAs** - ask questions, invite engagement
-4. **Hashtags** - 5-10 relevant hashtags (platform-appropriate)
-5. **Emojis** - Use strategically, not excessively
-6. **Character limits** - Respect platform limits (Twitter: 280, Instagram caption: 2200)
-
-Generate 3 caption variations: Professional, Casual, and Bold/Edgy.`,
-
-	buildUserPrompt: ({ contentDescription, platform, tone, cta }) =>
-		`**CONTENT:** ${contentDescription}\n\n**PLATFORM:** ${platform || "All platforms"}\n\n${tone ? `**TONE:** ${tone}\n` : ""}${cta ? `**CALL TO ACTION:** ${cta}\n` : ""}\n\nGenerate 3 caption variations.`,
+	tier: "tier2",
+	requiredFields: ["prompt", "platform"],
+	defaultModel: "kimi-k2.5",
+	buildSystemPrompt: () => "",
+	buildUserPrompt: () => "",
 
 	inputs: [
-		{
-			key: "contentDescription",
-			label: "Content Description",
-			type: "textarea",
-			placeholder:
-				"E.g. 'We just launched our AI-powered developer tools platform. It has 22+ free tools for debugging, testing, and code generation.'",
-			rows: 4,
-		},
 		{
 			key: "platform",
 			label: "Platform",
 			type: "select",
 			options: [
-				{ value: "All platforms", label: "All Platforms" },
-				{ value: "Instagram", label: "Instagram" },
-				{ value: "Twitter/X", label: "Twitter / X" },
-				{ value: "LinkedIn", label: "LinkedIn" },
-				{ value: "TikTok", label: "TikTok" },
-				{ value: "YouTube", label: "YouTube (description)" },
+				{ value: "youtube", label: "YouTube" },
+				{ value: "youtube_shorts", label: "YouTube Shorts" },
+				{ value: "tiktok", label: "TikTok" },
+				{ value: "instagram", label: "Instagram" },
+				{ value: "reddit", label: "Reddit" },
+				{ value: "linkedin", label: "LinkedIn" },
+				{ value: "x_twitter", label: "X (Twitter)" },
 			],
 		},
 		{
-			key: "tone",
-			label: "Tone (optional)",
-			type: "text",
-			placeholder: "E.g. 'Professional but approachable'",
-		},
-		{
-			key: "cta",
-			label: "Call to Action (optional)",
-			type: "text",
-			placeholder: "E.g. 'Sign up for the beta'",
+			key: "prompt",
+			label: "Caption Prompt",
+			type: "textarea",
+			rows: 6,
+			attachable: { accept: "image/jpeg,image/png,image/webp,image/gif" },
+			placeholder:
+				"E.g. 'We just launched our AI-powered developer tools platform. It has 22+ free tools for debugging, testing, and code generation.'",
 		},
 	],
+	ResultComponent: CaptionResultDisplay,
+	requireLengthSelection: true,
 };
diff --git a/app/src/types/index.ts b/app/src/types/index.ts
index 5ba04ee..088e3ad 100644
--- a/app/src/types/index.ts
+++ b/app/src/types/index.ts
@@ -11,6 +11,8 @@ export interface InputFieldConfig {
 	options?: { value: string; label: string }[];
 	/** For "files" type: accepted file extensions (e.g. ".py,.js,.zip") */
 	accept?: string;
+	/** For "textarea" type: allows attaching files (e.g. images) */
+	attachable?: { accept: string };
 	/** For "files" type: max number of files */
 	maxFiles?: number;
 	/** For "files" type: max total upload size in MB */
@@ -55,6 +57,14 @@ export interface ToolDefinition {
 	// --- UI config ---
 	/** Declarative form field definitions */
 	inputs: InputFieldConfig[];
+	/** Custom component for rendering tool-specific results. Receives raw result string. */
+	ResultComponent?: React.ComponentType<{
+		result: string;
+		isLoading?: boolean;
+		error?: { message: string; code?: string; action?: string } | null;
+	}>;
+	/** When true, shows a length-selection (short/long) dialog before execution. */
+	requireLengthSelection?: boolean;
 }
 
 export interface CategoryInfo {
diff --git a/services/python-tools/tools/caption-generator/generator.py b/services/python-tools/tools/caption-generator/generator.py
new file mode 100644
index 0000000..d48b03d
--- /dev/null
+++ b/services/python-tools/tools/caption-generator/generator.py
@@ -0,0 +1,314 @@
+import re
+import asyncio
+from openai import AsyncOpenAI
+from rules import PLATFORM_LIMITS, PLATFORM_REVERSE_MAP, get_limits
+
+
+def sanitize_output(text: str) -> str:
+    """Normalize model text: real newlines, one hyphen per dash run, at most one blank line."""
+    text = text.replace("\\n", "\n")
+    # Map em dashes first so mixed runs such as "—-" or "——" also collapse to a single hyphen.
+    text = re.sub(r'-{2,}', '-', text.replace('—', '-'))
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    return text.strip()
+
+async def extra_text(image_data: str, api_key: str) -> str:
+    """Best-effort OCR: ask the vision model for the text visible in an image.
+
+    `image_data` is sent as the image URL (presumably a data: URL from the web
+    UI - confirm against the caller). Returns "" when no image is given or the
+    request fails; failures are printed rather than raised.
+    """
+    if not image_data:
+        return ""
+    ocr_request = [
+        {"type": "text", "text": "Extract all visible text from this image. Return only the text content, nothing else."},
+        {"type": "image_url", "image_url": {"url": image_data}}
+    ]
+    client = AsyncOpenAI(base_url="https://api.oxlo.ai/v1", api_key=api_key)
+    try:
+        response = await client.chat.completions.create(
+            model="kimi-k2.5",
+            messages=[{"role": "user", "content": ocr_request}],
+            max_tokens=1000,
+        )
+        return response.choices[0].message.content or ""
+    except Exception as e:
+        print(f"OCR error: {e}")
+        return ""
+    finally:
+        # A client is created per call, so release its HTTP connections when done.
+        await client.close()
+
+def build_prompt(platform: str, length_type: str, prompt: str, context_from_image: str = "") -> str:
+    """Compose the user prompt for one caption request.
+
+    Chooses between three templates - Reddit posts, titled captions, or a single
+    plain caption - and fills in the platform's limits and style rules. Platforms
+    missing from PLATFORM_LIMITS use the LinkedIn rules. Text extracted from an
+    attached image, if any, is appended as additional context.
+
+    Args:
+        platform: Platform key such as "youtube" or "reddit".
+        length_type: "short" or "long"; selects the limits and the word cap.
+        prompt: The user's topic text.
+        context_from_image: Optional text extracted from an attached image.
+
+    Returns:
+        The formatted prompt string.
+    """
+    plat = PLATFORM_LIMITS.get(platform, PLATFORM_LIMITS["linkedin"])
+    limits = get_limits(platform, length_type)
+
+    # NOTE(review): a platform with title_optional=False is only treated as titled
+    # when it defines "title_short_min" - confirm that this is the intended rule.
+    title_optional = plat.get("title_optional", True)
+    has_title = (plat.get("title_max", 0) > 0 and title_optional) or "title_short_min" in plat
+
+    style_guides = {
+        "engaging_informative": "engaging, informative, YouTube-friendly. Hook viewers in the first line. Clear and conversational.",
+        "punchy_short_form": "punchy, fast-paced, hook-first. Perfect for short attention spans. Bold and energetic.",
+        "trending_energetic": "trending, energetic, TikTok-native. Use popular phrases naturally. Fun and relatable.",
+        "visual_storytelling": "visual-friendly, storytelling-focused. Complement the image/video. Emotional and engaging.",
+        "authentic_community": "authentic, community-focused, Reddit-native. No clickbait. Honest and direct.",
+        "professional_thoughtful": "professional, thought-provoking, LinkedIn-appropriate. No emojis or minimal. Value-driven.",
+        "concise_punchy": "concise, punchy, hook-first. Every word counts. Bold and direct.",
+    }
+    style_guide = style_guides.get(plat.get("style", "concise_punchy"), "concise and engaging")
+
+    emoji_range = plat.get("emoji_limit", (1, 3))
+    emoji_guide = f"Use {emoji_range[0]}-{emoji_range[1]} emojis, placed at the end of sentences or at the very end of the caption."
+    if plat.get("style") == "professional_thoughtful":
+        emoji_guide = "Use 0-1 emoji only if truly needed, or skip emojis entirely."
+
+    cta_text = ", ".join(plat.get("cta_patterns", [])[:3])
+    hashtag_range = plat.get("hashtag_count", (3, 5))
+    hashtag_text = (
+        "Do NOT use any hashtags."
+        if hashtag_range[1] == 0
+        else f"Add {hashtag_range[0]}-{hashtag_range[1]} relevant hashtags at the end on a new line."
+    )
+
+    word_limits = {
+        "youtube": {"short": 30, "long": 60},
+        "youtube_shorts": {"short": 15, "long": 25},
+        "tiktok": {"short": 15, "long": 25},
+        "instagram": {"short": 15, "long": 25},
+        "reddit": {"short": 100, "long": 200},
+        "linkedin": {"short": 100, "long": 200},
+        "x_twitter": {"short": 50, "long": 100},
+    }
+    max_words = word_limits.get(platform, {}).get(length_type, 25)
+
+    # Appended to whichever template is selected below.
+    image_context = f"\n\nAdditional context from image: {context_from_image}" if context_from_image else ""
+
+    if platform == "reddit":
+        title_min = limits.get("title_min", 50)
+        title_max = limits.get("title_max", 120)
+        caption_min = limits.get("caption_min", 500)
+        caption_max = limits.get("caption_max", 1000)
+        return f"""Create 3 Reddit posts with different titles.
+
+Write 3 posts:
+Title 1: [title {title_min}-{title_max} chars]
+Description 1: [post {caption_min}-{caption_max} chars, {max_words} words max]
+
+Title 2: [different title]
+Description 2: [different post]
+
+Title 3: [different title]
+Description 3: [different post]
+
+Rules:
+- All 3 titles must use different words
+- Use line breaks in body
+- End each with a question
+- {emoji_guide}
+
+Topic: {prompt}""" + image_context
+
+    if has_title:
+        title_max_val = plat.get("title_max", 60)
+        caption_min = limits.get("caption_min", 60)
+        caption_max = limits.get("caption_max", 100)
+        # NOTE(review): this template says "YouTube" whichever titled platform is selected.
+        return f"""Create 3 YouTube video captions with different titles.
+
+Write 3 posts:
+Title 1: [title max {title_max_val} chars]
+Description 1: [caption {caption_min}-{caption_max} chars, {max_words} words max - make it detailed and engaging]
+
+Title 2: [different title]
+Description 2: [different caption]
+
+Title 3: [different title]
+Description 3: [different caption]
+
+Rules:
+- All 3 titles must use different words/angles
+- Don't repeat same title words
+- Descriptions should be {caption_min}-{caption_max} characters - write close to the max
+- {hashtag_text}
+- {cta_text}
+- {emoji_guide}
+
+Topic: {prompt}""" + image_context
+
+    caption_min = limits.get("caption_min", 100)
+    caption_max = limits.get("caption_max", 280)
+    return f"""Write a natural, human-like social media caption.
+
+Platform: {plat['name']}
+Style: {style_guide}
+
+Write EXACTLY {caption_min}-{caption_max} characters.
+
+Requirements:
+- Hook viewers in the first line - this is the most important part
+- Be engaging, {style_guide}
+- {hashtag_text}
+- {cta_text}
+- {emoji_guide}
+- Do NOT use double dashes (--) or em dashes (---) - use a single hyphen (-) instead
+- Write like a real person, not like an AI
+- MAXIMUM {max_words} WORDS
+
+User's topic: {prompt}""" + image_context
+
+
+async def generate_caption(
+    client: AsyncOpenAI,
+    prompt: str,
+    platform: str,
+    length_type: str,
+    context_from_image: str = ""
+) -> str:
+    """Run one chat completion for the caption prompt and return the raw text ("" if none).
+
+    The token budget covers one full post (title plus caption) with headroom. A bare
+    `caption_max // 4` estimate truncates captions written close to the limit, because
+    hashtags, emojis and "Title N:" labels cost more than the ~4 chars/token average.
+    Over-long text is clipped to the character limit afterwards, so headroom is safe.
+    """
+    enhanced_prompt = build_prompt(platform, length_type, prompt, context_from_image)
+    plat = PLATFORM_LIMITS.get(platform, PLATFORM_LIMITS["linkedin"])
+    limits = get_limits(platform, length_type)
+    post_chars = limits.get("caption_max", 280) + limits.get("title_max", plat.get("title_max", 0))
+    max_tokens = min(post_chars // 2 + 64, 1000)
+
+    response = await client.chat.completions.create(
+        model="kimi-k2.5",
+        messages=[
+            {
+                "role": "system",
+                "content": f"You are a social media caption writing assistant. Return ONLY the caption text - no explanations, no markdown formatting, no extra text. CRITICAL: The caption must be EXACTLY between {limits.get('caption_min', 50)} and {limits.get('caption_max', 100)} characters. NEVER exceed {limits.get('caption_max', 100)} characters. Do not use double dashes (--) or em dashes (---). Use a single hyphen (-) instead."
+            },
+            {"role": "user", "content": enhanced_prompt}
+        ],
+        temperature=0.7,
+        max_tokens=max_tokens,
+    )
+    return response.choices[0].message.content or ""
+
+
+# Line-anchored "Title 1:", "Description 1:" or "Option 1:" labels, optionally bold.
+_LABEL_RE = re.compile(r'^[ \t*]*(title|description|option)[ \t]*\d*[ \t*]*:[ \t*]*', re.IGNORECASE | re.MULTILINE)
+
+
+def _clip(text: str, limit: int) -> str:
+    """Cut `text` to at most `limit` characters, preferring a word boundary."""
+    if len(text) <= limit:
+        return text
+    clipped = text[:limit]
+    last_space = clipped.rfind(" ")
+    return (clipped[:last_space] if last_space > 0 else clipped).strip()
+
+
+def _split_posts(text: str) -> list:
+    """Split a labelled response into (title, body) pairs; [] when no labelled body is found."""
+    labels = list(_LABEL_RE.finditer(text))
+    posts, title = [], None
+    for idx, label in enumerate(labels):
+        end = labels[idx + 1].start() if idx + 1 < len(labels) else len(text)
+        section = text[label.end():end].strip()
+        if label.group(1).lower() == "title":
+            # A title is one line; text after it without its own label is the body.
+            title, _, section = section.partition("\n")
+            title, section = title.strip() or None, section.strip()
+        if section:
+            posts.append((title, section))
+            title = None
+    return posts
+
+
+async def generate_all_captions(
+    client: AsyncOpenAI,
+    prompt: str,
+    platform: str,
+    length_type: str,
+    context_from_image: str = ""
+) -> dict:
+    """Request three completions in parallel and return up to three caption variations.
+
+    Each response is split into (title, body) posts. Variations are taken round-robin
+    (first post of every response, then second posts, ...) so each parallel call
+    contributes before any single response is used twice. Posts without a body are dropped.
+
+    Returns a dict with "title" (first title or None), "titles", "variation_type",
+    "platform" (display name) and "variations": {"text", "chars", "limit", "title"} dicts.
+    """
+    plat = PLATFORM_LIMITS.get(platform, PLATFORM_LIMITS["linkedin"])
+    limits = get_limits(platform, length_type)
+    is_reddit = platform == "reddit"
+    title_optional = plat.get("title_optional", True)
+    has_title = (plat.get("title_max", 0) > 0 and title_optional) or "title_short_min" in plat
+    wants_title = has_title or is_reddit
+    title_limit = limits.get("title_max", 120) if is_reddit else plat.get("title_max", 60)
+    caption_limit = limits.get("caption_max", 280)
+
+    responses = await asyncio.gather(
+        generate_caption(client, prompt, platform, length_type, context_from_image),
+        generate_caption(client, prompt, platform, length_type, context_from_image),
+        generate_caption(client, prompt, platform, length_type, context_from_image),
+    )
+
+    per_response = []
+    for raw in responses:
+        text = sanitize_output(raw) if raw else ""
+        posts = _split_posts(text)
+        if text and not posts and not _LABEL_RE.search(text):
+            # Unlabelled reply: on titled platforms a short first line is the title,
+            # but only when more text follows - a one-line reply is the caption itself.
+            lines = [line.strip() for line in text.split("\n") if line.strip()]
+            takes_title = wants_title and len(lines) > 1 and len(lines[0]) <= title_limit
+            if takes_title and not lines[0].startswith("#"):
+                posts = [(lines[0], "\n".join(lines[1:]))]
+            else:
+                posts = [(None, text)]
+        per_response.append(posts)
+
+    # Round-robin across responses: every first post, then every second post, and so on.
+    depth = max(len(posts) for posts in per_response)
+    ordered = [posts[i] for i in range(depth) for posts in per_response if i < len(posts)]
+
+    results = []
+    for title, body in ordered[:3]:
+        caption_text = _clip(body, caption_limit)
+        # Each variation keeps its own title; non-titled platforms never get one.
+        var_title = (_clip(title, title_limit) or None) if wants_title and title else None
+        results.append({
+            "text": caption_text,
+            "chars": len(caption_text),
+            "limit": caption_limit,
+            "title": var_title,
+        })
+    titles = [r["title"] for r in results if r["title"]]
+
+    return {
+        "title": titles[0] if titles else None,
+        "titles": titles,
+        "variation_type": length_type,
+        "platform": plat.get("name", platform),
+        "variations": results,
+    }
\ No newline at end of file
diff --git a/services/python-tools/tools/caption-generator/helper.py b/services/python-tools/tools/caption-generator/helper.py
new file mode 100644
index 0000000..ebe9a98
--- /dev/null
+++ b/services/python-tools/tools/caption-generator/helper.py
@@ -0,0 +1,25 @@
+from typing import List, Tuple
+
+def jaccard_similarity(text1: str, text2: str) -> float:
+    """Word-level Jaccard index of two texts, in the range [0.0, 1.0].
+
+    Texts are lower-cased and split on whitespace; the score is the number of
+    shared distinct tokens divided by the number of distinct tokens overall.
+    Returns 0.0 when either text contains no tokens.
+    """
+    tokens_a, tokens_b = (set(t.lower().split()) for t in (text1, text2))
+    if not (tokens_a and tokens_b):
+        return 0.0
+    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
+
+
+def check_variations(captions: List[str]) -> List[Tuple[int, int, float]]:
+    """Score every unordered caption pair with ``jaccard_similarity``.
+
+    Returns ``(i, j, score)`` tuples with ``i < j``, ordered by ``i`` then ``j``.
+    """
+    return [
+        (left, right, jaccard_similarity(first, captions[right]))
+        for left, first in enumerate(captions)
+        for right in range(left + 1, len(captions))
+    ]
\ No newline at end of file
diff --git a/services/python-tools/tools/caption-generator/requirements.txt b/services/python-tools/tools/caption-generator/requirements.txt
new file mode 100644
index 0000000..3ceaffc
--- /dev/null
+++ b/services/python-tools/tools/caption-generator/requirements.txt
@@ -0,0 +1 @@
+openai>=1.0.0
\ No newline at end of file
diff --git a/services/python-tools/tools/caption-generator/rules.py b/services/python-tools/tools/caption-generator/rules.py
new file mode 100644
index 0000000..280257c
--- /dev/null
+++ b/services/python-tools/tools/caption-generator/rules.py
@@ -0,0 +1,148 @@
+# Per-platform caption/title character limits and generation hints, keyed by internal platform id.
+# get_limits() below reads only the caption_*/title_* entries of each platform.
+PLATFORM_LIMITS = {
+    "youtube": {
+        "name": "YouTube",
+        "caption_short_min": 80,
+        "caption_short_max": 150,
+        "caption_long_min": 150,
+        "caption_long_max": 200,
+        "title_max": 60,
+        "title_optional": True,
+        "hashtag_count": (3, 5),  # presumably (min, max) - confirm against the consumer
+        "cta_patterns": ["Like and subscribe", "Let me know in comments", "Share your thoughts", "Don't forget to subscribe"],
+        "style": "engaging_informative",
+        "emoji_limit": (1, 3),  # presumably (min, max) - confirm against the consumer
+    },
+    "youtube_shorts": {
+        "name": "YouTube Shorts",
+        "caption_short_min": 60,
+        "caption_short_max": 100,
+        "caption_long_min": 80,  # NOTE(review): long range 80-100 sits inside the short range 60-100 - confirm
+        "caption_long_max": 100,
+        "title_max": 40,
+        "title_optional": True,
+        "hashtag_count": (2, 3),
+        "cta_patterns": ["Follow for more", "Like if you enjoyed", "Share with friends"],
+        "style": "punchy_short_form",
+        "emoji_limit": (1, 2),
+    },
+    "tiktok": {
+        "name": "TikTok",
+        "caption_short_min": 50,
+        "caption_short_max": 80,
+        "caption_long_min": 100,
+        "caption_long_max": 150,
+        "title_max": 35,
+        "title_optional": False,  # NOTE(review): False makes get_limits() drop title_max even though it is 35 - confirm
+        "hashtag_count": (3, 5),
+        "cta_patterns": ["Follow for more", "Duet this", "Share with friends", "Save this"],
+        "style": "trending_energetic",
+        "emoji_limit": (2, 5),
+    },
+    "instagram": {
+        "name": "Instagram",
+        "caption_short_min": 80,
+        "caption_short_max": 100,
+        "caption_long_min": 125,
+        "caption_long_max": 150,
+        "title_max": 0,  # 0 => get_limits() returns no title_max for this platform
+        "title_optional": False,
+        "hashtag_count": (3, 5),
+        "cta_patterns": ["Double tap if you agree", "Tag someone", "Share with a friend", "Link in bio"],
+        "style": "visual_storytelling",
+        "emoji_limit": (3, 6),
+    },
+    "reddit": {  # no title_max here: get_limits() uses the title_short_*/title_long_* ranges instead
+        "name": "Reddit",
+        "title_short_min": 60,
+        "title_short_max": 120,
+        "title_long_min": 120,
+        "title_long_max": 200,
+        "caption_short_min": 500,
+        "caption_short_max": 1000,
+        "caption_long_min": 1000,
+        "caption_long_max": 2000,
+        "hashtag_count": (0, 0),
+        "cta_patterns": ["What do you think?", "Share your experience", "Comments welcome"],
+        "style": "authentic_community",
+        "emoji_limit": (0, 1),
+    },
+    "linkedin": {  # also the fallback entry get_limits() uses for unknown platform ids
+        "name": "LinkedIn",
+        "caption_short_min": 150,
+        "caption_short_max": 300,
+        "caption_long_min": 600,
+        "caption_long_max": 2000,
+        "title_max": 0,
+        "title_optional": False,
+        "hashtag_count": (3, 5),
+        "cta_patterns": ["What are your thoughts?", "Share your experience", "Let's connect", "Comments welcome"],
+        "style": "professional_thoughtful",
+        "emoji_limit": (0, 2),
+    },
+    "x_twitter": {
+        "name": "X (Twitter)",
+        "caption_short_min": 100,
+        "caption_short_max": 140,
+        "caption_long_min": 200,
+        "caption_long_max": 280,
+        "title_max": 0,
+        "title_optional": False,
+        "hashtag_count": (2, 3),
+        "cta_patterns": ["Quote this", "Repost", "Share your thoughts"],
+        "style": "concise_punchy",
+        "emoji_limit": (0, 1),
+    },
+}
+
+# Frontend platform value -> PLATFORM_LIMITS key (every entry currently maps to itself).
+PLATFORM_KEYS = {
+    "youtube": "youtube",
+    "youtube_shorts": "youtube_shorts",
+    "tiktok": "tiktok",
+    "instagram": "instagram",
+    "reddit": "reddit",
+    "linkedin": "linkedin",
+    "x_twitter": "x_twitter",
+}
+
+# Display name -> PLATFORM_LIMITS key. Dict lookups are exact-case; both spellings of the X name are listed.
+PLATFORM_REVERSE_MAP = {
+    "YouTube": "youtube",
+    "YouTube Shorts": "youtube_shorts",
+    "TikTok": "tiktok",
+    "Instagram": "instagram",
+    "Reddit": "reddit",
+    "LinkedIn": "linkedin",
+    "X (Twitter)": "x_twitter",
+    "X(Twitter)": "x_twitter",
+}
+
+def get_limits(platform: str, length_type: str) -> dict:
+    """Resolve the character limits for one platform and caption length.
+
+    Unknown platforms use the "linkedin" entry; any ``length_type`` other than
+    "short" selects the ``*_long_*`` values. The result always carries
+    ``caption_min``/``caption_max``. Entries defining ``title_short_min`` also get
+    length-dependent ``title_min``/``title_max``; otherwise ``title_max`` is added
+    only when it is positive and ``title_optional`` is truthy.
+    """
+    spec = PLATFORM_LIMITS.get(platform, PLATFORM_LIMITS["linkedin"])
+    want_short = length_type == "short"
+    title_allowed = spec.get("title_max", 0) > 0 and spec.get("title_optional", True)
+    caption_bounds = {
+        "caption_min": spec["caption_short_min" if want_short else "caption_long_min"],
+        "caption_max": spec["caption_short_max" if want_short else "caption_long_max"],
+    }
+
+    if "title_short_min" in spec:
+        # Title-range platform: the title keys precede the caption keys in the result.
+        title_bounds = {
+            "title_min": spec["title_short_min" if want_short else "title_long_min"],
+            "title_max": spec["title_short_max" if want_short else "title_long_max"],
+        }
+        return {**title_bounds, **caption_bounds}
+    if title_allowed:
+        return {**caption_bounds, "title_max": spec["title_max"]}
+    return caption_bounds
\ No newline at end of file
diff --git a/services/python-tools/tools/caption-generator/tool.py b/services/python-tools/tools/caption-generator/tool.py
new file mode 100644
index 0000000..c815156
--- /dev/null
+++ b/services/python-tools/tools/caption-generator/tool.py
@@ -0,0 +1,210 @@
+import os
+import re
+from openai import AsyncOpenAI
+import asyncio
+from generator import extra_text, generate_all_captions
+from rules import PLATFORM_LIMITS, PLATFORM_REVERSE_MAP
+from helper import check_variations
+
+MANIFEST = {  # static tool metadata; not referenced elsewhere in this file (presumably read by the tool runner - confirm)
+    "id": "caption-generator",
+    "name": "Social Media Captions",
+    "description": "Generate platform-optimized captions with hashtag suggestions for YouTube, TikTok, Instagram, LinkedIn, Reddit, and X/Twitter.",
+    "author": "Oxlo Team",
+    "version": "3.0.0",
+}
+# Regeneration policy for run(): a generation round is retried while any caption pair is too similar.
+MAX_RETRIES = 2  # retries allowed after the first attempt (so at most 3 generation rounds)
+SIMILARITY_THRESHOLD = 0.70  # pairs scoring strictly above this in check_variations() trigger a retry
+
+
+def normalize_platform(platform: str) -> str:
+    """Map a user- or frontend-supplied platform label to a ``PLATFORM_LIMITS`` key.
+
+    Tries, in order: known aliases, display names (case-insensitive), then a
+    substring match that ignores spaces/underscores/hyphens and prefers the longest
+    key, so "YouTube Shorts clip" resolves to "youtube_shorts", not "youtube".
+    Non-string, blank or unrecognised input falls back to "linkedin".
+    """
+    if not isinstance(platform, str) or not platform.strip():
+        return "linkedin"  # also keeps "" from substring-matching every key below
+    label = platform.strip().lower()
+    aliases = {
+        "youtube": "youtube", "youtube_shorts": "youtube_shorts", "youtube shorts": "youtube_shorts",
+        "tiktok": "tiktok", "instagram": "instagram", "reddit": "reddit", "linkedin": "linkedin",
+        "x": "x_twitter", "x_twitter": "x_twitter", "x(twitter)": "x_twitter", "twitter": "x_twitter",
+    }
+    # Display names are matched case-insensitively; explicit aliases take precedence.
+    known = {**{name.lower(): key for name, key in PLATFORM_REVERSE_MAP.items()}, **aliases}
+    if label in known:
+        return known[label]
+    compact = re.sub(r"[\s_-]+", "", label)
+    keys = {key: key.replace("_", "") for key in PLATFORM_LIMITS}
+    # Key contained in the label: longest key first so the most specific platform wins.
+    for key in sorted(keys, key=len, reverse=True):
+        if keys[key] in compact:
+            return key
+    # Label is a fragment of a key (e.g. "insta"): first key in table order wins.
+    for key, squashed in keys.items():
+        if compact and compact in squashed:
+            return key
+    return "linkedin"
+
+
+def validate_length_type(length_type: str) -> str:
+    """Return "short" or "long" (case/whitespace-insensitive); non-strings and anything else give "short"."""
+    cleaned = length_type.strip().lower() if isinstance(length_type, str) else ""
+    return cleaned if cleaned in ("short", "long") else "short"
+
+
+def _strip_markup(text: str) -> str:
+    """Drop Markdown bold markers and collapse dash runs / em dashes to one hyphen."""
+    text = re.sub(r'\*\*', '', text)
+    text = re.sub(r'-{2,}', '-', text)
+    return re.sub(r'—', '-', text)
+
+
+async def run(data: dict) -> dict:
+    """Generate up to three caption variations for the request in ``data``.
+
+    Reads ``prompt``, ``platform``, ``length_type`` and the optional ``image`` from
+    ``data``; the API key comes from the ``OXLO_API_KEY`` environment variable.
+    Validation failures return ``{"error": <message>, "result": None}``. On success
+    the dict holds ``result`` (formatted text), ``metadata``, ``title`` and
+    ``variations`` (at most three dicts with ``text``/``chars``/``limit``/``title``).
+    """
+    # dict.get only falls back to its default when the key is absent, so an explicit
+    # null (or any non-string) prompt is coerced here instead of crashing len() below.
+    prompt = data.get("prompt")
+    if not isinstance(prompt, str):
+        prompt = ""
+    platform_input = data.get("platform", "linkedin")
+    length_type_input = data.get("length_type", "short")
+    image_data = data.get("image", "")
+
+    import logging
+    logger = logging.getLogger("caption-generator")
+    logger.info(f"Received request: platform={platform_input}, length={length_type_input}, prompt_len={len(prompt)}, has_image={bool(image_data)}")
+
+    platform = normalize_platform(platform_input)
+    length_type = validate_length_type(length_type_input)
+    api_key = os.getenv("OXLO_API_KEY")
+
+    if not api_key:
+        logger.error("No OXLO_API_KEY set")
+        return {"error": "Please enter your Oxlo API key.", "result": None}
+
+    if not prompt:
+        logger.error("Empty prompt")
+        return {"error": "Please enter a prompt for captions.", "result": None}
+
+    too_vague = "Please describe your content in more detail. Add more information about what you want to share."
+    words = prompt.strip().split()
+    if len(words) < 5:
+        logger.error(f"Prompt too short: {len(words)} words")
+        return {"error": too_vague, "result": None}
+
+    # Keyboard-mash filter for short prompts.
+    # NOTE(review): these are substring checks, so a short prompt containing
+    # "latest" or "contest" matches "test" and is rejected - confirm intended.
+    random_patterns = ["asdf", "qwerty", "12345", "abc", "xxx", "yyy", "test", "ffff", "dddd"]
+    lower_prompt = prompt.lower()
+    if len(words) < 10 and any(p in lower_prompt for p in random_patterns):
+        logger.error("Random pattern detected in prompt")
+        return {"error": too_vague, "result": None}
+
+    client = AsyncOpenAI(base_url="https://api.oxlo.ai/v1", api_key=api_key)
+
+    context_from_image = ""
+    if image_data:
+        logger.info("Extracting text from image")
+        context_from_image = await extra_text(image_data, api_key)
+        logger.info(f"Image OCR result: {len(context_from_image)} chars")
+
+    plat_info = PLATFORM_LIMITS.get(platform, PLATFORM_LIMITS["linkedin"])
+    logger.info(f"Generating captions for {platform} ({length_type})")
+
+    # Each round fires three generations concurrently; a round is discarded and
+    # retried (at most MAX_RETRIES times) when any two captions are too similar.
+    all_results = []
+    retry_count = 0
+    while len(all_results) < 3 and retry_count <= MAX_RETRIES:
+        logger.info(f"Generation attempt {retry_count + 1}")
+        new_results = await asyncio.gather(
+            generate_all_captions(client, prompt, platform, length_type, context_from_image),
+            generate_all_captions(client, prompt, platform, length_type, context_from_image),
+            generate_all_captions(client, prompt, platform, length_type, context_from_image),
+        )
+        flat_captions = [v.get("text", "") for r in new_results for v in r.get("variations", [])]
+
+        if len(flat_captions) >= 3:
+            similarities = check_variations(flat_captions)
+            too_similar = any(s > SIMILARITY_THRESHOLD for _, _, s in similarities)
+            if too_similar and retry_count < MAX_RETRIES:
+                retry_count += 1
+                continue
+
+        all_results = new_results
+        break
+
+    plat_name = plat_info.get("name", platform.capitalize())
+
+    # Headline title: the first non-empty title among the results.
+    title = None
+    for r in all_results:
+        if r.get("title"):
+            title = _strip_markup(r.get("title", "")).strip()
+            break
+
+    variations_output = []
+    for result in all_results:
+        for variation in result.get("variations", []):
+            text = variation.get("text", "")
+            if not text:
+                continue
+            text = _strip_markup(text)
+            text = text.replace("\\n", "\n")  # literal backslash-n -> real newline
+            text = re.sub(r'^[#*>\s]+', '', text, flags=re.MULTILINE)
+            text = re.sub(r'\n{3,}', '\n\n', text)
+            text = text.strip()
+            if not text:
+                continue
+            # Clean the per-variation title like the headline title so the rendered
+            # output never shows raw Markdown; non-string values pass through untouched.
+            var_title = variation.get("title", "")
+            if isinstance(var_title, str):
+                var_title = _strip_markup(var_title).strip()
+            variations_output.append({
+                "text": text,
+                "chars": len(text),
+                "limit": variation.get("limit", 280),
+                "title": var_title,
+            })
+
+    logger.info(f"Title: {title}")
+    logger.info(f"Variations generated: {len(variations_output)}")
+    if variations_output:
+        logger.info(f"First variation chars: {variations_output[0]['chars']}")
+
+    output_lines = []
+    for i, v in enumerate(variations_output[:3], 1):
+        if v.get("title"):
+            output_lines.append(v["title"])
+            output_lines.append("")
+        output_lines.append(f"Variation {i} - Description:")
+        output_lines.append(v["text"])
+        output_lines.append(f"[{v['chars']}/{v['limit']} chars]")
+        output_lines.append("")
+
+    return {
+        "result": "\n".join(output_lines),
+        "metadata": {
+            "platform": platform,
+            "platform_name": plat_name,
+            "length_type": length_type,
+            "variation_type": length_type,
+            "has_image_context": bool(context_from_image),
+        },
+        "title": title,
+        "variations": variations_output[:3],
+    }
\ No newline at end of file