Files
workspace/tools/extract_assistant_turns.py

108 lines
3.6 KiB
Python

#!/usr/bin/env python3
import json
import sys
import os
import glob
from pathlib import Path
def parse_jsonl_file(file_path):
    """Parse a JSONL session file and extract consecutive user/assistant pairs.

    Each returned dict carries "user", "assistant", "agent_id", and "session"
    keys. A pair is formed only when an assistant message immediately follows
    a user message. Pairs whose user prompt is an auto-memory-indexer cron
    job are skipped. Blank lines, invalid JSON lines, and records whose
    "type" is not "message" are ignored; any other error aborts parsing and
    returns whatever pairs were collected so far.
    """
    pairs = []
    try:
        messages = []
        # Stream the file instead of loading it all with readlines().
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if data.get("type") == "message":
                    messages.append(data)
        i = 0
        while i < len(messages):
            payload = messages[i].get("message") or {}
            if payload.get("role") != "user":
                i += 1
                continue
            if i + 1 >= len(messages):
                # Trailing user message with no response — nothing to pair.
                break
            next_payload = messages[i + 1].get("message") or {}
            if next_payload.get("role") != "assistant":
                i += 1
                continue
            # Use .get with a default so a missing "content" key degrades to
            # an empty string instead of raising KeyError and aborting the file.
            user_content = extract_text_content(payload.get("content", []))
            assistant_content = extract_text_content(next_payload.get("content", []))
            # Skip auto-memory-indexer cron jobs (both markers must be present).
            if not (user_content.startswith("[cron:") and "auto-memory-indexer" in user_content):
                pairs.append({
                    "user": user_content,
                    "assistant": assistant_content,
                    "agent_id": "case",
                    "session": "main",
                })
            i += 2  # Consume both halves of the pair.
    except Exception as e:
        # Best-effort: report the failure and return partial results.
        print(f"Error processing {file_path}: {e}", file=sys.stderr)
    return pairs
def extract_text_content(content_array):
    """Flatten a message content array into one display string.

    Text parts are kept verbatim, tool calls are rendered as a short
    ``[Used tool: name]`` placeholder, and every other part type
    (e.g. thinking blocks) is dropped.
    """
    parts = []
    for part in content_array:
        kind = part.get("type")
        if kind == "text":
            parts.append(part.get("text", ""))
        elif kind == "toolCall":
            parts.append(f"[Used tool: {part.get('name', 'unknown')}]")
    return " ".join(parts).strip()
def find_session_files(sessions_dir):
    """Return every *.jsonl session file in sessions_dir, newest first.

    Ordering is by file modification time, most recently modified first.
    """
    candidates = glob.glob(os.path.join(sessions_dir, "*.jsonl"))
    candidates.sort(key=os.path.getmtime, reverse=True)
    return candidates
def main():
    """Print the last 10 user/assistant pairs from recent sessions as JSONL.

    An alternate sessions directory may be supplied as the first CLI
    argument; otherwise the default agent sessions path is used. Output is
    one JSON object per line on stdout.
    """
    # Generalized: allow the hard-coded path to be overridden from the CLI.
    if len(sys.argv) > 1:
        sessions_dir = sys.argv[1]
    else:
        sessions_dir = "/home/wdjones/.openclaw/agents/main/sessions/"
    all_pairs = []
    # Files arrive newest-first; stop scanning once we have enough pairs.
    for session_file in find_session_files(sessions_dir):
        all_pairs.extend(parse_jsonl_file(session_file))
        if len(all_pairs) >= 10:
            break
    # NOTE(review): because files are newest-first, [-10:] keeps the pairs
    # from the *oldest* file scanned — confirm this ordering is intended.
    for pair in all_pairs[-10:]:
        print(json.dumps(pair))


if __name__ == "__main__":
    main()