108 lines
3.6 KiB
Python
108 lines
3.6 KiB
Python
#!/usr/bin/env python3
|
|
import json
|
|
import sys
|
|
import os
|
|
import glob
|
|
from pathlib import Path
|
|
|
|
def parse_jsonl_file(file_path, *, agent_id="case", session="main"):
    """Parse a JSONL session file and extract user/assistant message pairs.

    Each non-blank line is decoded as JSON. Only JSON objects whose
    ``"type"`` is ``"message"`` are considered. A pair is produced when a
    message with role ``"user"`` is immediately followed by a message with
    role ``"assistant"``. Pairs whose user text starts with ``"[cron:"``
    and also mentions ``"auto-memory-indexer"`` are dropped.

    Malformed input is handled per item rather than per file: lines that are
    not valid JSON or not JSON objects are ignored, and a pair whose content
    cannot be extracted is reported on stderr and skipped, so one bad record
    no longer discards the rest of the file.

    Args:
        file_path: Path of the JSONL file to read (decoded as UTF-8).
        agent_id: Value stored under ``"agent_id"`` in every returned pair.
        session: Value stored under ``"session"`` in every returned pair.

    Returns:
        A list of dicts with the keys ``"user"``, ``"assistant"``,
        ``"agent_id"`` and ``"session"``, in file order. The list is empty
        when the file cannot be read or decoded.
    """
    payloads = []
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(record, dict) or record.get("type") != "message":
                    continue
                payload = record.get("message")
                # Keep a placeholder for payload-less messages so they still
                # separate a user message from a later assistant message.
                payloads.append(payload if isinstance(payload, dict) else {})
    except (OSError, UnicodeError) as e:
        print(f"Error processing {file_path}: {e}", file=sys.stderr)
        return []

    pairs = []
    i = 0
    while i < len(payloads):
        current = payloads[i]
        following = payloads[i + 1] if i + 1 < len(payloads) else None
        is_exchange = (
            current.get("role") == "user"
            and following is not None
            and following.get("role") == "assistant"
        )
        if not is_exchange:
            i += 1
            continue

        # Both messages of the exchange are consumed, whether or not the
        # pair ends up being kept.
        i += 2
        try:
            user_content = extract_text_content(current["content"])
            assistant_content = extract_text_content(following["content"])
        except (KeyError, TypeError, AttributeError) as e:
            print(
                f"Error processing {file_path}: skipping malformed message pair ({e!r})",
                file=sys.stderr,
            )
            continue

        # Skip auto-memory indexer cron jobs.
        if user_content.startswith("[cron:") and "auto-memory-indexer" in user_content:
            continue

        pairs.append({
            "user": user_content,
            "assistant": assistant_content,
            "agent_id": agent_id,
            "session": session,
        })

    return pairs
|
|
|
|
def extract_text_content(content_array):
    """Extract text from content array, skipping thinking blocks.

    Items of type ``"text"`` contribute their ``"text"`` value and items of
    type ``"toolCall"`` contribute a ``"[Used tool: <name>]"`` marker. Every
    other item type is ignored. The collected parts are joined with single
    spaces and the result is stripped.

    Args:
        content_array: Normally a list of content-item dicts. ``None``
            yields an empty string and a plain string is returned stripped,
            so callers need not special-case those shapes.

    Returns:
        The combined text as a single string (possibly empty).
    """
    if content_array is None:
        return ""
    if isinstance(content_array, str):
        return content_array.strip()

    text_parts = []
    for item in content_array:
        if not isinstance(item, dict):
            # Tolerate stray non-object entries instead of raising.
            continue
        item_type = item.get("type")
        if item_type == "text":
            text = item.get("text", "")
            # A missing or non-string text still contributes an empty part,
            # which keeps the join safe without changing the spacing.
            text_parts.append(text if isinstance(text, str) else "")
        elif item_type == "toolCall":
            # Include tool calls in a simplified format.
            tool_name = item.get("name", "unknown")
            text_parts.append(f"[Used tool: {tool_name}]")

    return " ".join(text_parts).strip()
|
|
|
|
def find_session_files(sessions_dir):
    """Return the ``*.jsonl`` paths directly inside *sessions_dir*.

    The paths are ordered by modification time, most recently modified first.
    """
    candidates = glob.glob(os.path.join(sessions_dir, "*.jsonl"))
    # Newest first.
    candidates.sort(key=os.path.getmtime, reverse=True)
    return candidates
|
|
|
|
def main(sessions_dir="/home/wdjones/.openclaw/agents/main/sessions/", max_pairs=10):
    """Print the most recent user/assistant pairs as JSON lines on stdout.

    Session files are visited newest first until at least *max_pairs* pairs
    have been collected. The last *max_pairs* of them are then printed, one
    JSON object per line, oldest first.

    Args:
        sessions_dir: Directory that holds the ``*.jsonl`` session files.
        max_pairs: Maximum number of pairs to print; values below 1 print
            nothing.
    """
    session_files = find_session_files(sessions_dir)

    recent_pairs = []
    for session_file in session_files:
        # Files arrive newest first, so pairs from each further (older) file
        # go in front of what is already collected. Appending them would make
        # the tail slice below select the oldest pairs and could drop the
        # newest session entirely.
        recent_pairs = parse_jsonl_file(session_file) + recent_pairs

        # Stop when we have enough pairs.
        if len(recent_pairs) >= max_pairs:
            break

    # A slice of [-0:] would return everything, so handle non-positive limits
    # explicitly.
    selected = recent_pairs[-max_pairs:] if max_pairs > 0 else []

    # Output each pair as JSON to stdout.
    for pair in selected:
        print(json.dumps(pair))
|
|
|
|
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()