Full sync - all projects, memory, configs
This commit is contained in:
108
tools/extract_assistant_turns.py
Normal file
108
tools/extract_assistant_turns.py
Normal file
@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
from pathlib import Path
|
||||
|
||||
def parse_jsonl_file(file_path):
    """Parse a JSONL session file and extract user/assistant message pairs.

    Each non-blank line is decoded as JSON; only objects whose ``"type"`` is
    ``"message"`` are kept. A pair is produced whenever a message with role
    ``"user"`` is immediately followed by one with role ``"assistant"``.
    Pairs whose user text starts with ``"[cron:"`` and mentions
    ``"auto-memory-indexer"`` are dropped.

    Args:
        file_path: Path of the ``.jsonl`` session file to read.

    Returns:
        A list of dicts with the keys ``"user"``, ``"assistant"``,
        ``"agent_id"`` (always ``"case"``) and ``"session"`` (always
        ``"main"``), in file order. If reading or processing fails, the error
        is reported on stderr and the pairs collected so far are returned.
    """
    pairs = []

    try:
        messages = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    # A corrupt line must not cost us the rest of the file.
                    continue
                # json.loads may return a list/str/number for valid JSON that
                # is not an object; such lines cannot be messages.
                if isinstance(data, dict) and data.get("type") == "message":
                    messages.append(data)

        index = 0
        while index < len(messages):
            current = messages[index].get("message")
            if not isinstance(current, dict) or current.get("role") != "user":
                index += 1
                continue

            following = None
            if index + 1 < len(messages):
                following = messages[index + 1].get("message")
            if not isinstance(following, dict) or following.get("role") != "assistant":
                # A user message without a direct assistant reply yields no
                # pair; the next message is examined on its own.
                index += 1
                continue

            # A missing or null "content" is treated as an empty content list.
            user_content = extract_text_content(current.get("content") or [])
            assistant_content = extract_text_content(following.get("content") or [])

            # Skip auto-memory indexer cron jobs
            is_indexer_cron = (
                user_content.startswith("[cron:")
                and "auto-memory-indexer" in user_content
            )
            if not is_indexer_cron:
                pairs.append({
                    "user": user_content,
                    "assistant": assistant_content,
                    "agent_id": "case",
                    "session": "main",
                })
            index += 2  # Skip both messages of the pair

    except Exception as e:
        # Deliberate best-effort boundary: one unreadable session file is
        # reported and must not abort processing of the others.
        print(f"Error processing {file_path}: {e}", file=sys.stderr)

    return pairs
|
||||
|
||||
def extract_text_content(content_array):
    """Extract text from a message content array, skipping thinking blocks.

    Items of type ``"text"`` contribute their ``"text"`` value; items of type
    ``"toolCall"`` contribute a ``"[Used tool: <name>]"`` marker. Every other
    item type (including thinking blocks) is ignored.

    Args:
        content_array: A list of content-item dicts. ``None`` or an empty
            value yields ``""``; a plain string is returned stripped. Items
            that are not dicts are ignored.

    Returns:
        The collected parts joined with single spaces and stripped of
        leading/trailing whitespace.
    """
    if not content_array:
        return ""
    if isinstance(content_array, str):
        # Iterating a string would yield characters, which have no .get().
        return content_array.strip()

    text_parts = []
    for item in content_array:
        if not isinstance(item, dict):
            continue
        item_type = item.get("type")
        if item_type == "text":
            text = item.get("text", "")
            # A null/non-string text would make str.join fail below.
            text_parts.append(text if isinstance(text, str) else "")
        elif item_type == "toolCall":
            # Include tool calls in a simplified format
            tool_name = item.get("name", "unknown")
            text_parts.append(f"[Used tool: {tool_name}]")

    return " ".join(text_parts).strip()
|
||||
|
||||
def find_session_files(sessions_dir):
    """Return every ``*.jsonl`` path directly inside *sessions_dir*.

    The paths are ordered by file modification time, most recently modified
    first.
    """
    session_paths = glob.glob(os.path.join(sessions_dir, "*.jsonl"))
    # Newest first: larger mtime sorts to the front.
    session_paths.sort(key=os.path.getmtime, reverse=True)
    return session_paths
|
||||
|
||||
def main(sessions_dir="/home/wdjones/.openclaw/agents/main/sessions/", max_pairs=10):
    """Print the most recent user/assistant pairs, one JSON object per line.

    Session files are visited newest-first (by modification time) and reading
    stops as soon as at least *max_pairs* pairs have been collected, so older
    files are only opened when the newer ones do not hold enough turns.

    Args:
        sessions_dir: Directory that holds the ``*.jsonl`` session files.
        max_pairs: Maximum number of pairs written to stdout.
    """
    recent_pairs = []

    # Process session files until we have at least max_pairs assistant turns
    for session_file in find_session_files(sessions_dir):
        # Files arrive newest-first, while the pairs inside one file are in
        # file order. Putting each older file's pairs in FRONT keeps the
        # combined list oldest -> newest, so its tail really is the most
        # recent turns (appending would let an older file push them out).
        recent_pairs = parse_jsonl_file(session_file) + recent_pairs

        # Stop when we have enough pairs
        if len(recent_pairs) >= max_pairs:
            break

    # A slice of [-0:] would return everything, so guard non-positive limits.
    selected = recent_pairs[-max_pairs:] if max_pairs > 0 else []

    # Output each pair as JSON to stdout
    for pair in selected:
        print(json.dumps(pair))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user