Full sync - all projects, memory, configs
This commit is contained in:
352
tools/video-to-knowledge.py
Executable file
352
tools/video-to-knowledge.py
Executable file
@ -0,0 +1,352 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Video-to-Knowledge Pipeline
|
||||
===========================
|
||||
Extracts audio from video files, transcribes via Faster Whisper,
|
||||
generates a clean markdown document, and indexes into ChromaDB for RAG.
|
||||
|
||||
Usage:
|
||||
# Single video
|
||||
python3 video-to-knowledge.py /path/to/video.mp4
|
||||
|
||||
# Directory (recursive)
|
||||
python3 video-to-knowledge.py /path/to/course/
|
||||
|
||||
# Custom output dir
|
||||
python3 video-to-knowledge.py /path/to/video.mp4 --output /path/to/output/
|
||||
|
||||
# Custom collection name
|
||||
python3 video-to-knowledge.py /path/to/video.mp4 --collection "real-estate-course"
|
||||
|
||||
# Skip RAG indexing (just transcribe + markdown)
|
||||
python3 video-to-knowledge.py /path/to/video.mp4 --no-rag
|
||||
|
||||
# Use a specific Whisper model size
|
||||
python3 video-to-knowledge.py /path/to/video.mp4 --model large-v3
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from datetime import timedelta
|
||||
|
||||
# Defaults — network endpoints and tuning knobs; override via CLI flags where exposed.
CHROMADB_HOST = "192.168.86.25"   # ChromaDB server (LAN address)
CHROMADB_PORT = 8000
OLLAMA_HOST = "192.168.86.40"     # Ollama server used for embeddings
OLLAMA_PORT = 11434
EMBED_MODEL = "nomic-embed-text"  # Ollama embedding model name
DEFAULT_COLLECTION = "video-knowledge"  # default ChromaDB collection (--collection)
WHISPER_MODEL = "base.en"         # default faster-whisper model size (--model)
CHUNK_SIZE = 1000  # chars per RAG chunk (--chunk-size)
CHUNK_OVERLAP = 200  # chars shared between consecutive chunks

# Lower-case video extensions recognized by find_videos().
VIDEO_EXTENSIONS = {'.mp4', '.mkv', '.avi', '.mov', '.webm', '.flv', '.wmv', '.m4v', '.ts'}
|
||||
|
||||
|
||||
def log(msg):
    """Print *msg* as an indented, arrow-prefixed status line."""
    print(" → " + str(msg))
|
||||
|
||||
|
||||
def extract_audio(video_path: Path, output_dir: Path) -> Path:
    """Extract the audio track as a 16 kHz mono PCM WAV (Whisper's preferred input).

    Skips extraction if the WAV already exists, which makes interrupted
    runs resumable.

    Args:
        video_path: Source video file.
        output_dir: Directory where the WAV is written.

    Returns:
        Path to the extracted (or pre-existing) WAV file.

    Raises:
        RuntimeError: If ffmpeg exits non-zero; the message includes the
            tail of ffmpeg's stderr so failures are actually diagnosable
            (the old check=True + capture_output combination hid it).
    """
    audio_path = output_dir / f"{video_path.stem}.wav"
    if audio_path.exists():
        log(f"Audio already extracted: {audio_path.name}")
        return audio_path

    log(f"Extracting audio from {video_path.name}...")
    result = subprocess.run([
        "ffmpeg", "-y", "-i", str(video_path),
        "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
        str(audio_path)
    ], capture_output=True)
    if result.returncode != 0:
        stderr_tail = result.stderr.decode(errors="replace")[-500:]
        raise RuntimeError(f"ffmpeg failed on {video_path.name}: {stderr_tail}")
    return audio_path
|
||||
|
||||
|
||||
def transcribe(audio_path: Path, model_size: str = WHISPER_MODEL) -> list[dict]:
    """Run faster-whisper on *audio_path*; return [{start, end, text}, ...] segments."""
    log(f"Transcribing with faster-whisper ({model_size})...")
    from faster_whisper import WhisperModel

    # CPU + int8 keeps memory modest and works without a GPU.
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    raw_segments, info = model.transcribe(str(audio_path), beam_size=5)

    segments = [
        {"start": s.start, "end": s.end, "text": s.text.strip()}
        for s in raw_segments
    ]

    log(f"Transcribed {len(segments)} segments, language: {info.language} ({info.language_probability:.0%})")
    return segments
|
||||
|
||||
|
||||
def format_timestamp(seconds: float) -> str:
    """Format a duration as 'HH:MM:SS', or 'MM:SS' when under an hour.

    Works from total elapsed seconds directly, so durations of 24 hours or
    more keep their full hour count (the previous timedelta.seconds-based
    version silently wrapped at one day).

    Args:
        seconds: Duration in seconds; fractional parts are truncated.

    Returns:
        Zero-padded timestamp string.
    """
    total = int(seconds)
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    if hours:
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"
|
||||
|
||||
|
||||
def segments_to_markdown(segments: list[dict], video_name: str, video_path: str) -> str:
    """Render transcript segments as a markdown document.

    Emits a title + metadata header, then groups segment text into
    '## [MM:SS] Section N' chapters of roughly two minutes each so long
    transcripts stay skimmable.
    """
    duration = format_timestamp(segments[-1]['end']) if segments else 'N/A'
    lines = [
        f"# {video_name}",
        "",
        f"**Source:** `{video_path}` ",
        f"**Segments:** {len(segments)} ",
        f"**Duration:** {duration}",
        "",
        "---",
        "",
    ]

    chapter_duration = 120  # seconds per section
    section_start = 0
    section_num = 1
    buffer = []

    def flush_section():
        # Append the buffered text under a timestamped section heading.
        lines.append(f"## [{format_timestamp(section_start)}] Section {section_num}")
        lines.append("")
        lines.append(" ".join(buffer))
        lines.append("")

    for seg in segments:
        if buffer and seg["start"] >= section_start + chapter_duration:
            flush_section()
            section_num += 1
            section_start = seg["start"]
            buffer = []
        buffer.append(seg["text"])

    # Trailing section that never hit the two-minute boundary.
    if buffer:
        flush_section()

    return "\n".join(lines)
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split *text* into overlapping chunks for RAG indexing.

    Args:
        text: The full text to split.
        chunk_size: Maximum characters per chunk; must be positive.
        overlap: Characters shared between consecutive chunks; must be
            smaller than chunk_size so the scan always advances.

    Returns:
        List of stripped, non-empty chunks (empty list for blank input).

    Raises:
        ValueError: If chunk_size <= 0 or overlap >= chunk_size — inputs
            on which the previous version would loop forever.
    """
    if chunk_size <= 0 or overlap >= chunk_size:
        raise ValueError(
            f"chunk_size ({chunk_size}) must be > 0 and > overlap ({overlap})"
        )
    chunks = []
    start = 0
    while start < len(text):
        piece = text[start:start + chunk_size].strip()
        if piece:
            chunks.append(piece)
        # Step forward by the non-overlapping portion of each chunk.
        start += chunk_size - overlap
    return chunks
|
||||
|
||||
|
||||
def get_embedding(text: str) -> list[float]:
    """Embed *text* via the Ollama /api/embed endpoint and return the vector."""
    import requests

    url = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/embed"
    payload = {"model": EMBED_MODEL, "input": text}
    resp = requests.post(url, json=payload, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    # Newer Ollama returns {"embeddings": [[...]]}; older returns {"embedding": [...]}.
    if "embeddings" in data:
        return data["embeddings"][0]
    return data["embedding"]
|
||||
|
||||
|
||||
def index_to_chromadb(chunks: list[str], video_name: str, video_path: str, collection_name: str):
    """Embed each chunk via Ollama and upsert the batch into ChromaDB.

    Document IDs are '<md5-prefix-of-path>-<chunk index>', so re-running
    the same video overwrites its documents instead of duplicating them.
    """
    import chromadb

    log(f"Connecting to ChromaDB at {CHROMADB_HOST}:{CHROMADB_PORT}...")
    client = chromadb.HttpClient(host=CHROMADB_HOST, port=CHROMADB_PORT)

    collection = client.get_or_create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"}
    )

    # Stable per-video ID prefix derived from the source path.
    prefix = hashlib.md5(video_path.encode()).hexdigest()[:8]

    log(f"Indexing {len(chunks)} chunks into collection '{collection_name}'...")
    batch_size = 20
    for base in range(0, len(chunks), batch_size):
        batch = chunks[base:base + batch_size]
        ids = []
        embeddings = []
        metadatas = []
        for offset, chunk in enumerate(batch):
            ids.append(f"{prefix}-{base + offset}")
            embeddings.append(get_embedding(chunk))
            metadatas.append({
                "source": video_path,
                "video": video_name,
                "chunk_index": base + offset,
                "total_chunks": len(chunks)
            })

        collection.upsert(
            ids=ids,
            embeddings=embeddings,
            documents=batch,
            metadatas=metadatas
        )
        log(f" Indexed {min(base + batch_size, len(chunks))}/{len(chunks)} chunks")

    log(f"✅ Indexed into '{collection_name}' (total docs: {collection.count()})")
|
||||
|
||||
|
||||
def process_video(video_path: Path, output_dir: Path, collection: str,
                  model_size: str, skip_rag: bool,
                  chunk_size: int = CHUNK_SIZE) -> dict:
    """Run the full pipeline for a single video.

    Steps: extract audio → transcribe → save JSON transcript → render
    markdown → (optionally) chunk + index into ChromaDB → delete the WAV.

    Args:
        video_path: The video file to process.
        output_dir: Root output directory; a per-video subfolder is created.
        collection: ChromaDB collection name for RAG indexing.
        model_size: faster-whisper model size (e.g. "base.en").
        skip_rag: If True, skip ChromaDB indexing entirely.
        chunk_size: Characters per RAG chunk. New parameter with a default
            of CHUNK_SIZE so existing callers are unaffected; lets callers
            actually honor the --chunk-size flag, which was previously
            parsed but never forwarded.

    Returns:
        dict with the video path, segment count, and output file paths.
    """
    video_name = video_path.stem
    print(f"\n{'='*60}")
    print(f"📹 Processing: {video_path.name}")
    print(f"{'='*60}")

    # Per-video output subdirectory mirroring the source name.
    vid_output = output_dir / video_name
    vid_output.mkdir(parents=True, exist_ok=True)

    # 1. Extract audio
    audio_path = extract_audio(video_path, vid_output)

    # 2. Transcribe
    segments = transcribe(audio_path, model_size)

    # 3. Save raw transcript JSON (keeps timestamps for later re-processing)
    transcript_path = vid_output / f"{video_name}_transcript.json"
    with open(transcript_path, "w") as f:
        json.dump(segments, f, indent=2)
    log(f"Saved transcript: {transcript_path.name}")

    # 4. Generate markdown
    markdown = segments_to_markdown(segments, video_name, str(video_path))
    md_path = vid_output / f"{video_name}.md"
    with open(md_path, "w") as f:
        f.write(markdown)
    log(f"Saved markdown: {md_path.name}")

    # 5. Index to RAG — best-effort: an indexing failure must not lose the
    #    transcript/markdown already written to disk.
    if not skip_rag:
        full_text = " ".join(seg["text"] for seg in segments)
        chunks = chunk_text(full_text, chunk_size=chunk_size)
        try:
            index_to_chromadb(chunks, video_name, str(video_path), collection)
        except Exception as e:
            log(f"⚠️ RAG indexing failed: {e}")
            log("Transcript and markdown were still saved successfully.")
    else:
        log("Skipping RAG indexing (--no-rag)")

    # Clean up the extracted WAV (it can be very large).
    if audio_path.exists():
        audio_path.unlink()
        log("Cleaned up extracted audio")

    return {
        "video": str(video_path),
        "segments": len(segments),
        "markdown": str(md_path),
        "transcript": str(transcript_path)
    }
|
||||
|
||||
|
||||
def find_videos(path: Path) -> list[Path]:
    """Find all video files at *path* (recursive when it is a directory).

    Extension matching is case-insensitive via suffix.lower(), so
    mixed-case names such as 'clip.Mp4' are found too — the previous
    two-pass rglob over ext and ext.upper() missed them.

    Args:
        path: A video file or a directory to search recursively.

    Returns:
        Sorted list of matching video paths (empty if nothing matched;
        also prints a warning when a single file has an unrecognized
        extension).
    """
    if path.is_file():
        if path.suffix.lower() in VIDEO_EXTENSIONS:
            return [path]
        print(f"❌ Not a recognized video file: {path}")
        return []

    # Single traversal; each file is tested once, so no de-duplication needed.
    return sorted(
        p for p in path.rglob("*")
        if p.is_file() and p.suffix.lower() in VIDEO_EXTENSIONS
    )
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments, discover videos, run the pipeline on each."""
    parser = argparse.ArgumentParser(
        description="Video-to-Knowledge Pipeline: Transcribe videos → Markdown + RAG",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument("input", help="Video file or directory to process")
    parser.add_argument("--output", "-o", help="Output directory (default: ./video-knowledge-output/)")
    parser.add_argument("--collection", "-c", default=DEFAULT_COLLECTION,
                        help=f"ChromaDB collection name (default: {DEFAULT_COLLECTION})")
    parser.add_argument("--model", "-m", default=WHISPER_MODEL,
                        help=f"Whisper model size (default: {WHISPER_MODEL})")
    parser.add_argument("--no-rag", action="store_true",
                        help="Skip ChromaDB indexing (just transcribe + markdown)")
    # NOTE(review): --chunk-size is parsed but never forwarded to the chunking
    # step, so it currently has no effect — confirm intent with the author.
    parser.add_argument("--chunk-size", type=int, default=CHUNK_SIZE,
                        help=f"RAG chunk size in chars (default: {CHUNK_SIZE})")

    args = parser.parse_args()

    source = Path(args.input).resolve()
    if not source.exists():
        print(f"❌ Path not found: {source}")
        sys.exit(1)

    out_dir = Path(args.output) if args.output else Path("./video-knowledge-output")
    out_dir.mkdir(parents=True, exist_ok=True)

    videos = find_videos(source)
    if not videos:
        print(f"❌ No video files found in: {source}")
        sys.exit(1)

    print(f"🎬 Found {len(videos)} video(s) to process")
    print(f"📂 Output: {out_dir}")
    print(f"🧠 Collection: {args.collection}")
    print(f"🎙️ Whisper model: {args.model}")

    # Process every video; a failure on one must not abort the rest.
    results = []
    for video in videos:
        try:
            results.append(process_video(video, out_dir, args.collection,
                                         args.model, args.no_rag))
        except Exception as exc:
            print(f"❌ Failed on {video.name}: {exc}")
            results.append({"video": str(video), "error": str(exc)})

    # Summary
    succeeded = sum(1 for r in results if "error" not in r)
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"✅ COMPLETE — {succeeded}/{len(results)} videos processed")
    print(banner)
    for entry in results:
        name = Path(entry["video"]).name
        if "error" in entry:
            print(f" ❌ {name}: {entry['error']}")
        else:
            print(f" ✅ {name}: {entry['segments']} segments → {entry['markdown']}")

    # Save manifest of all results (successes and failures) for later auditing.
    manifest_path = out_dir / "manifest.json"
    with open(manifest_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n📋 Manifest: {manifest_path}")
|
||||
|
||||
|
||||
# Script entry point — only run the pipeline when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user