airmageddon/monitor.py at main · RamonBeast/airmageddon · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import argparse
import time
import json
from datetime import datetime
from sentinel import Sentinel
from florence import Florence
from PIL import Image
from utils.logger import Logger
from yolo.yolo_monitor import YoloMonitor
from utils.functions import LLMFunctions
from utils.listener import EventPublisher
from utils.configuration import Configuration

# Module-wide configuration object, created once at import time. Both
# save_detection() and main() read their settings from it through
# get_config_param().
conf = Configuration()

def save_detection(image: Image.Image, caption: str, response: str, act: bool):
    """Store a captured frame and its analysis in the captures folder.

    Two sibling files are written into the folder configured under
    ``captures_folder``. Both are named after the current local time
    formatted as ``YYYYmmdd-HHMMSS``:

    * ``<stamp>.png``  - the frame itself
    * ``<stamp>.json`` - a JSON object with the keys ``image`` (the stamp,
      without extension), ``caption``, ``response`` and ``act``

    Args:
        image: Frame to persist; written through its ``save`` method.
        caption: Text stored under the ``caption`` key.
        response: Text stored under the ``response`` key.
        act: Flag stored under the ``act`` key.
    """
    # NOTE: the stamp only has 1s resolution, so two detections saved within
    # the same second share a name and the later one overwrites the earlier.
    filename = datetime.now().strftime("%Y%m%d-%H%M%S")
    save_path = os.path.join(conf.get_config_param('captures_folder'), filename)

    resp = {
        'image': filename,
        'caption': caption,
        'response': response,
        'act': act
    }

    image.save(save_path + '.png')

    # Use a context manager so the metadata file is flushed and closed
    # deterministically instead of whenever the handle is garbage collected.
    with open(save_path + '.json', 'w') as meta_file:
        json.dump(resp, meta_file)

def main():
    """Run the monitoring pipeline until the feed ends or analysis fails.

    Steps, in order:

    1. Parse the command-line options; every default comes from ``conf``.
    2. Build the YOLO monitor, Florence captioner, Sentinel and the LLM
       function dispatcher, then publish a ``MonitoringStarted`` memory.
    3. Loop over frames: skip frames that are throttled, not comparable, or
       similar enough to the previous one without containing a configured
       trigger; caption the remaining frames with Florence and hand the
       caption to the Sentinel. When the Sentinel response is a function
       call, the detection is saved, recorded as a memory and the function
       is invoked.

    The loop ends when ``StopIteration`` is raised inside the loop body
    (presumably by ``video.run()`` once the source is exhausted - confirm
    against YoloMonitor), or when the Sentinel returns ``None``.

    Returns:
        None in every case.
    """
    parser = argparse.ArgumentParser(description='AIrmageddon')
    parser.add_argument('--source', default=conf.get_config_param('camera_feed'), help='Feed source (rtsp, video file, directory, single image)')
    parser.add_argument('--max-memories', default=conf.get_config_param('guard_max_memories'), help='Maximum number of memories to pass to the agents')
    parser.add_argument('--min-frame-similarity', default=conf.get_config_param('min_frame_similarity'), help='Minimum frame similarity to trigger a new frame capture')
    parser.add_argument('--frame-capture-interval', default=conf.get_config_param('frame_capture_interval'), help='Interval between frame captures')
    args = parser.parse_args()

    Logger.info('Loading models...')

    video = YoloMonitor(source=args.source)
    video.warmup()
    Logger.info('Yolo loaded')

    florence = Florence()
    Logger.info('Florence is ready')

    sentinel = Sentinel(max_memories=int(args.max_memories))
    Logger.info('Sentinel is active')

    llm_func = LLMFunctions()

    # Let's notify that we are starting our monitoring
    memory = EventPublisher()
    memory.create_memory('MonitoringStarted', True)

    # Features of the most recently compared frame; empty until the first
    # frame has been seen.
    prev_feats = []
    # Wall-clock time of the last frame that passed the throttle.
    last_capture_ts = 0
    # Options arrive as strings from the command line, so convert explicitly.
    threshold = float(args.min_frame_similarity)
    frame_capture_interval = float(args.frame_capture_interval)
    triggers = conf.get_config_param('triggers')
    # Florence task token; also the key of the caption in Florence's result.
    caption_task = '<MORE_DETAILED_CAPTION>'

    # Monitoring loop
    while True:
        t0 = time.time()

        try:
            # Extract detections and features from each frame
            detections, feats = video.run()

            # Seed the reference features with the very first frame.
            # len() is used on purpose: the features may be an array type
            # whose truth value is ambiguous (TODO confirm against
            # YoloMonitor).
            if len(prev_feats) == 0:
                prev_feats = feats
                continue

            # Throttle video capture (only for cameras and streams)
            if video.webcam and t0 - last_capture_ts < frame_capture_interval:
                continue

            last_capture_ts = time.time()

            sim = video.get_similarity(prev_feats, feats)
            prev_feats = feats

            # A negative similarity marks a frame that cannot be compared
            if sim < 0:
                Logger.info('Discarding frame, it was either the first or the resolution has been adjusted')
                continue

            # Nothing to do when the frame is similar enough to the previous
            # one and none of the configured triggers was detected.
            if sim >= threshold and not any(trigger in detections for trigger in triggers):
                continue

            # From here on, we start reasoning on the image itself
            Logger.warning(f'Change detected, similarity: {sim:0.2f}, detections: {detections}', ts=True)

            image = video.get_image()

            # Florence and save_detection() work on PIL images; convert
            # anything else (e.g. a raw array) first.
            if not isinstance(image, Image.Image):
                image = Image.fromarray(image)
                image = image.convert('RGB')

            # Pass the image to Florence
            frame_caption = florence.process_frame(caption_task, image=image)

            if caption_task not in frame_caption:
                Logger.warning('Florence did not return a caption, skipping frame')
                continue

            caption = frame_caption[caption_task]

            # Start the loop between the Guard and the Ex-Burglar
            response = sentinel.analyze_feed(caption)

            if response is None:
                Logger.error('Sentinel cannot analyze feed, terminating')
                return None

            if (tokens := sentinel.get_cumulative_tokens()) is not None:
                Logger.notify(f'[$] Cumulative tokens - prompt: {tokens["prompt_tokens"]}, completion: {tokens["completion_tokens"]}', ts=True)

            # Only responses that are function calls are saved and acted on;
            # any other response simply lets monitoring continue.
            if (func_name := llm_func.is_function_call(response)) is not None:
                save_detection(image, caption, response, False)
                memory.create_memory(caption, func_name)
                llm_func.call_class_function(llm_func, response)
        except StopIteration:
            break

        # Only reached for frames that went through the whole pipeline;
        # skipped frames `continue` before this point.
        t1 = time.time()
        Logger.info(f'Frame processed in {t1 - t0:.6f}s')

# Start monitoring only when this file is executed as a script, so that
# importing the module does not kick off the loop.
if __name__ == "__main__":
    main()