114 lines
4.1 KiB
Python
114 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import json
|
|
import os
|
|
import glob
|
|
import subprocess
|
|
import sys
|
|
|
|
def clean_text(text):
    """Collapse all whitespace in *text* to single spaces.

    Newlines, carriage returns and tabs are turned into spaces and runs
    of whitespace are squeezed, so the result is safe to embed on one
    line of JSON output. Falsy input (``None``, ``""``) yields ``""``.
    """
    if not text:
        return ""
    # Replace each structural whitespace character with a plain space,
    # then squeeze repeated whitespace with a split/join pass.
    for ch in ('\n', '\r', '\t'):
        text = text.replace(ch, ' ')
    return ' '.join(text.split())
|
|
|
|
def _content_text(content):
    """Flatten a message ``content`` field to plain text.

    The field is either a plain string or a list of block dicts; only
    blocks with ``type == 'text'`` contribute. Anything else yields "".
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Guard with isinstance: a malformed (non-dict) block should be
        # skipped, not crash extraction for the whole file.
        return ''.join(
            item.get('text', '')
            for item in content
            if isinstance(item, dict) and item.get('type') == 'text'
        )
    return ''


def extract_pairs_from_session(file_path):
    """Extract user-assistant pairs from a session file.

    Reads a JSONL session transcript and returns a list of dicts with
    ``user``, ``assistant``, ``agent_id``, ``session`` and ``timestamp``
    keys — one per user message immediately followed by an assistant
    message. Unparseable lines and malformed messages are skipped;
    unexpected errors are reported to stderr and whatever pairs were
    collected so far are returned.
    """
    pairs = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Keep only records tagged as messages; ignore lines that are
        # not valid JSON.
        messages = []
        for line in lines:
            try:
                data = json.loads(line.strip())
                if data.get('type') == 'message':
                    messages.append(data)
            except json.JSONDecodeError:
                continue

        # Extract user-assistant pairs: a user message immediately
        # followed by an assistant message. The range already stops at
        # len - 1, so messages[i + 1] is always valid (the original
        # re-checked i + 1 < len(messages) redundantly). Using .get for
        # the nested keys means one malformed record skips only that
        # pair instead of aborting the rest of the file via the outer
        # except.
        for i in range(len(messages) - 1):
            msg = messages[i].get('message') or {}
            nxt = messages[i + 1].get('message') or {}
            if msg.get('role') != 'user' or nxt.get('role') != 'assistant':
                continue

            # Timestamp of the assistant turn; used by the caller for
            # chronological sorting. Skip pairs without one rather than
            # raising (and rather than mixing types in the later sort).
            timestamp = messages[i + 1].get('timestamp')
            if timestamp is None:
                continue

            user_text = clean_text(_content_text(msg.get('content')))
            assistant_text = clean_text(_content_text(nxt.get('content')))

            if user_text and assistant_text:
                pairs.append({
                    'user': user_text,
                    'assistant': assistant_text,
                    'agent_id': 'case',
                    'session': 'main',
                    'timestamp': timestamp,
                })
    except Exception as e:
        # Boundary handler for a best-effort batch script: report the
        # file that failed and return any pairs gathered before the
        # error.
        print(f"Error processing {file_path}: {e}", file=sys.stderr)

    return pairs
|
|
|
|
def main():
    """Collect recent session transcripts and emit user/assistant pairs.

    Scans the agent's session directory for the 10 most recently
    modified ``*.jsonl`` files, extracts user→assistant message pairs
    from each, and prints the 10 most recent pairs (by timestamp) as
    one JSON object per line on stdout. Diagnostics go to stderr so
    stdout stays machine-readable.
    """
    sessions_dir = "/home/wdjones/.openclaw/agents/main/sessions/"

    # Find session files in pure Python (glob + mtime sort) instead of
    # shelling out to `find ... | sort | head` with shell=True: this
    # avoids shell quoting/injection hazards and the fragile text
    # parsing of the pipeline's stdout. '**' with recursive=True also
    # matches files directly in sessions_dir.
    pattern = os.path.join(sessions_dir, '**', '*.jsonl')
    session_files = [p for p in glob.glob(pattern, recursive=True)
                     if os.path.isfile(p)]
    # Most recently modified first, mirroring `sort -rn | head -10`.
    session_files.sort(key=os.path.getmtime, reverse=True)

    all_pairs = []
    for file_path in session_files[:10]:
        all_pairs.extend(extract_pairs_from_session(file_path))

    # Sort by timestamp and keep only the 10 most recent pairs.
    all_pairs.sort(key=lambda x: x['timestamp'])
    last_10_pairs = all_pairs[-10:]

    print(f"Extracted {len(last_10_pairs)} assistant turns from recent sessions", file=sys.stderr)

    # Emit one JSON object per pair; the timestamp was only needed for
    # ordering, so drop it from the final output.
    for pair in last_10_pairs:
        output_pair = {k: v for k, v in pair.items() if k != 'timestamp'}
        print(json.dumps(output_pair, ensure_ascii=True))


if __name__ == "__main__":
    main()