166 lines
6.8 KiB
Python
166 lines
6.8 KiB
Python
import cv2
|
|
import time
|
|
from collections import defaultdict
|
|
from backend.models.yolo_manager import YOLOManager
|
|
from backend.models.clip_manager import CLIPManager
|
|
from backend.utils.image_utils import is_blurry, crop_image, convert_cv2_to_pil
|
|
|
|
class VideoProcessor:
    """Detect, track, and classify road objects in a video.

    Pipeline ("best shot" strategy):
      1. YOLO tracks objects across frames with persistent track ids.
      2. For each track, the largest crop seen so far is buffered — a proxy
         for the closest / most detailed view of the object.
      3. When a track leaves the frame (or the video ends), its buffered
         crop is classified once with CLIP against text label candidates.
    """

    # Pixel distance from any frame border at which an object is considered
    # to be leaving the view and is classified early.
    EDGE_MARGIN = 50

    def __init__(self, yolo_model_path="yolov8n.pt", clip_model_id="openai/clip-vit-base-patch32"):
        """Load the YOLO tracker and the CLIP classifier.

        Args:
            yolo_model_path: Weights file for the YOLO detector/tracker.
            clip_model_id: Hugging Face model id for the CLIP model.
        """
        self.yolo = YOLOManager(yolo_model_path)
        self.clip = CLIPManager(clip_model_id)

        # Best shot buffered per live track id.
        # Format: {track_id: {'crop': np.array, 'area': float,
        #                     'frame_idx': int, 'class_id': int,
        #                     'bbox': list, 'processed': bool}}
        self.active_tracks = {}

        # Finalized classification dicts (see _classify_and_store).
        self.final_results = []

        # CLIP text candidates.  The pothole list deliberately includes
        # common false positives (shadow, patch work, manhole) so CLIP can
        # attribute a look-alike crop to them instead of "pothole".
        self.pothole_labels = ["pothole", "shadow", "patch work", "manhole", "road crack"]
        self.sign_labels = ["stop sign", "yield sign", "speed limit 30", "speed limit 40", "speed limit 50", "speed limit 60", "pedestrian crossing", "no u-turn", "traffic light", "keep right"]

        # Number of frames read so far (1-based index of the current frame).
        self.frame_count = 0

    def process_video(self, video_path: str):
        """Run the full track-and-classify pipeline over one video file.

        Args:
            video_path: Path to a video readable by OpenCV.

        Returns:
            List of result dicts (one per classified track), or an empty
            list if the video could not be opened.
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"Error opening video: {video_path}")
            return []

        try:
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

            print(f"Processing video {video_path} ({width}x{height})...")

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                self.frame_count += 1

                # 1. Run YOLO tracking on the frame.
                results = self.yolo.track(frame)

                current_frame_ids = set()
                # BUG FIX: the original code `continue`d when a frame had no
                # tracked boxes, which skipped the stale-track cleanup below.
                # Tracks that vanished on such a frame were only finalized at
                # the very end of the video and kept their crops in memory.
                # Now an empty frame simply yields an empty id set and the
                # cleanup still runs.
                if results.boxes is not None and results.boxes.id is not None:
                    boxes = results.boxes.xyxy.cpu().numpy()
                    track_ids = results.boxes.id.cpu().numpy()
                    class_ids = results.boxes.cls.cpu().numpy()  # schema depends on training

                    for box, track_id, cls in zip(boxes, track_ids, class_ids):
                        track_id = int(track_id)
                        current_frame_ids.add(track_id)
                        self._update_best_shot(frame, box, track_id, int(cls),
                                               width, height)

                # 2. Finalize tracks that are no longer visible.  Iterate a
                # copy of the keys so entries can be deleted while looping.
                # NOTE(review): a real ByteTrack keeps "lost" tracks alive
                # for a few frames; here a single missed frame ends a track.
                for tid in list(self.active_tracks.keys()):
                    if tid not in current_frame_ids:
                        # It's gone from view (or mostly gone).
                        self._classify_and_store(tid)
                        # Drop processed entries to free the buffered crops.
                        if self.active_tracks[tid].get('processed'):
                            del self.active_tracks[tid]
        finally:
            # BUG FIX: release the capture even if tracking or
            # classification raises mid-loop (was leaked on exception).
            cap.release()

        # 3. Flush any tracks still alive at end-of-video.
        for tid in list(self.active_tracks.keys()):
            self._classify_and_store(tid)

        print("Processing complete.")
        return self.final_results

    def _update_best_shot(self, frame, box, track_id, class_id, width, height):
        """Buffer or refresh the best (largest-area) crop for one detection.

        Also triggers early classification when the box comes within
        EDGE_MARGIN pixels of any frame border (object about to leave).
        """
        x1, y1, x2, y2 = box
        area = (x2 - x1) * (y2 - y1)

        if track_id not in self.active_tracks:
            # First sighting of this track id.
            self.active_tracks[track_id] = {
                'crop': crop_image(frame, box),
                'area': area,
                'frame_idx': self.frame_count,
                'class_id': class_id,
                'bbox': box,
                'processed': False
            }
        elif area > self.active_tracks[track_id]['area'] and not self.active_tracks[track_id]['processed']:
            # Bigger box == closer/more detailed view; adopt as new best shot.
            self.active_tracks[track_id].update({
                'crop': crop_image(frame, box),
                'area': area,
                'frame_idx': self.frame_count,
                'bbox': box
            })

        # Trigger classification if the object is near an edge (leaving
        # frame).  _classify_and_store is idempotent, so repeated triggers
        # while the box hugs the border are harmless.
        margin = self.EDGE_MARGIN
        if x1 < margin or y1 < margin or x2 > width - margin or y2 > height - margin:
            self._classify_and_store(track_id)

    def _classify_and_store(self, track_id):
        """Classify a track's best crop with CLIP and record the result.

        Idempotent: a missing or already-processed track id is a no-op.
        """
        track_data = self.active_tracks.get(track_id)
        if not track_data or track_data.get('processed'):
            return

        crop = track_data['crop']

        # NOTE(review): is_blurry() is imported but unused — blurry crops
        # are currently classified anyway.  Consider skipping them or
        # marking the result as low confidence.

        # Prepare for CLIP.
        pil_image = convert_cv2_to_pil(crop)

        # Classify against BOTH label sets and take the best match, rather
        # than trusting the buffered YOLO class_id (its schema is
        # training-specific and not known here).
        candidates = self.sign_labels + self.pothole_labels
        best_label, score = self.clip.get_best_match(pil_image, candidates, threshold=0.5)

        # Any label outside the sign list (including whatever get_best_match
        # returns for a below-threshold non-match — TODO confirm its
        # contract) falls into "Road Damage".
        obj_type = "Traffic Sign" if best_label in self.sign_labels else "Road Damage"

        result = {
            "id": track_id,
            "type": obj_type,
            "subtype": best_label,
            "confidence": float(score),
            "frame_idx": track_data['frame_idx'],
            # In a real app, save the crop to disk and return a URL here.
        }

        self.final_results.append(result)
        self.active_tracks[track_id]['processed'] = True
|
|
|
|
if __name__ == "__main__":
    # Smoke test: constructing the processor loads both models.
    vp = VideoProcessor()
    # vp.process_video("test_video.mp4")
|