index_codebase/demo.py at main · sudipme/index_codebase · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python3
"""
Demo of the RAG system for code analysis.

This script demonstrates how to:
1. Parse Python code files
2. Generate embeddings
3. Store embeddings in the Qdrant vector database
4. Search for similar code using semantic search
5. Generate context for an LLM to answer code-related questions
"""
import os
import argparse
import json
from app.main import CodeEmbedder


def print_separator():
    """Print an 80-character rule of '=' with a blank line before and after it."""
    rule = "=" * 80
    print(f"\n{rule}\n")


def embed_code(embedder, code_path, force_update=False):
    """Embed a single file or a whole directory through *embedder*.

    A directory is handed to ``embedder.embed_directory`` and a regular file
    to ``embedder.embed_file``; ``force_update`` is forwarded unchanged as a
    keyword argument. If the path is neither, a "Path not found" message is
    printed and nothing is embedded.
    """
    is_directory = os.path.isdir(code_path)

    # Guard clause: bail out early when the path is neither a directory nor a file.
    if not is_directory and not os.path.isfile(code_path):
        print(f"Path not found: {code_path}")
        return

    if is_directory:
        print(f"Embedding directory: {code_path}")
        embedder.embed_directory(code_path, force_update=force_update)
        return

    print(f"Embedding file: {code_path}")
    embedder.embed_file(code_path, force_update=force_update)


def search_code(embedder, query, top_k=5):
    """Run a search through *embedder* and print a short report of the hits.

    Calls ``embedder.search(query, top_k=top_k)`` and unpacks every hit as a
    ``(code_id, similarity, code_text)`` triple. For each hit the ID, the
    similarity formatted to four decimals, and at most the first five lines
    of the code are printed; a trailing "..." marks longer snippets.

    Returns the object returned by ``embedder.search`` unchanged.
    """
    preview_lines = 5

    print(f"Searching for code related to: '{query}'")
    print(f"Top {top_k} results:")
    hits = embedder.search(query, top_k=top_k)

    for rank, hit in enumerate(hits, start=1):
        code_id, similarity, code_text = hit
        lines = code_text.split("\n")
        code_preview = "\n".join(lines[:preview_lines])

        print(f"\nRESULT {rank}:")
        print(f"ID: {code_id}")
        print(f"Similarity: {similarity:.4f}")
        print(f"Code snippet (first few lines):\n{code_preview}")
        # Signal that the snippet was cut off.
        if len(lines) > preview_lines:
            print("...")

    return hits


def get_llm_context(embedder, query, top_k=5):
    """Announce the query, then return the LLM context built by *embedder*.

    The return value is whatever ``embedder.get_context_for_llm(query,
    top_k=top_k)`` produces; it is passed through untouched.
    """
    print(f"Generating LLM context for query: '{query}'")
    return embedder.get_context_for_llm(query, top_k=top_k)


def _build_parser():
    """Create the command-line parser with the embed/search/context/demo subcommands."""
    parser = argparse.ArgumentParser(description="Code Embedding and RAG System Demo")
    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # embed <path> [--force]
    embed_parser = subparsers.add_parser("embed", help="Embed code files")
    embed_parser.add_argument("path", help="Path to file or directory to embed")
    embed_parser.add_argument("--force", action="store_true", help="Force update existing embeddings")

    # search <query> [--top-k N]
    search_parser = subparsers.add_parser("search", help="Search for code")
    search_parser.add_argument("query", help="Search query")
    search_parser.add_argument("--top-k", type=int, default=5, help="Number of results to return")

    # context <query> [--top-k N]
    context_parser = subparsers.add_parser("context", help="Get LLM context for a query")
    context_parser.add_argument("query", help="Query to generate context for")
    context_parser.add_argument("--top-k", type=int, default=5, help="Number of results to include")

    # demo takes no arguments, so the returned sub-parser does not need a name.
    subparsers.add_parser("demo", help="Run a full demo")

    return parser


def _truncate(text, limit):
    """Return *text* unchanged, or its first *limit* characters plus "..." if longer."""
    # NOTE(review): assumes *text* is a str (it is sliced and concatenated) — confirm
    # against what CodeEmbedder.get_context_for_llm returns.
    if len(text) > limit:
        return text[:limit] + "..."
    return text


def _run_demo(embedder, base_dir):
    """Run the three-step walkthrough: embed, search, then build an LLM prompt."""
    print_separator()
    print("STEP 1: Embedding code")

    # Embed the "app" and "test" directories that sit next to this script.
    embed_code(embedder, os.path.join(base_dir, "app"))
    embed_code(embedder, os.path.join(base_dir, "test"))

    # Run some example searches
    print_separator()
    print("STEP 2: Searching for code")

    queries = [
        "custom calculation",
        "database initialization",
        "embed Python files",
    ]
    for query in queries:
        print_separator()
        search_code(embedder, query, top_k=2)

    # Generate context for an LLM
    print_separator()
    print("STEP 3: Generating context for an LLM")

    query = "how does the arithmetic calculator work?"
    context = get_llm_context(embedder, query, top_k=3)

    print("\nContext for LLM:")
    print(_truncate(context, 500))

    # Show example LLM prompt
    print_separator()
    print("Example LLM prompt with context:")
    prompt = f"""
You are a helpful coding assistant. Use the following code context to answer the user's question.

CONTEXT:
{context}

USER QUESTION:
{query}
"""
    print(_truncate(prompt, 1000))

    print_separator()
    print("Demo completed successfully!")
    print("Try running other commands:")
    print("  python demo.py embed <path>")
    print("  python demo.py search \"your query\"")
    print("  python demo.py context \"your query\"")


def main(argv=None):
    """Run the demo command-line interface.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None`` (the default), ``argparse`` reads them from
            ``sys.argv``, which matches calling ``main()`` with no arguments.

    The ``embed``, ``search`` and ``context`` subcommands delegate to
    ``embed_code``, ``search_code`` and ``get_llm_context``. ``demo``, or no
    subcommand at all, runs the full walkthrough. Invalid arguments make
    ``argparse`` raise ``SystemExit`` before any embedder is created.
    """
    args = _build_parser().parse_args(argv)

    # Directory containing this script; the database and demo inputs live beside it.
    base_dir = os.path.dirname(__file__)

    # Create a data directory for the database
    data_dir = os.path.join(base_dir, "data")
    os.makedirs(data_dir, exist_ok=True)

    # Initialize the code embedder
    print("Initializing CodeEmbedder...")
    embedder = CodeEmbedder(data_dir=data_dir)

    if args.command == "embed":
        embed_code(embedder, args.path, args.force)

    elif args.command == "search":
        search_code(embedder, args.query, args.top_k)

    elif args.command == "context":
        context = get_llm_context(embedder, args.query, args.top_k)
        print("\nContext for LLM:")
        print(_truncate(context, 1000))

    elif args.command == "demo" or args.command is None:
        _run_demo(embedder, base_dir)


# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()