114 lines
4.1 KiB
Python
114 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import json
|
|
import os
|
|
import glob
|
|
import subprocess
|
|
import sys
|
|
|
|
def clean_text(text):
    """Collapse all whitespace in *text* to single spaces.

    Newlines, carriage returns and tabs are turned into spaces and runs
    of whitespace are squeezed, so the result is safe to embed on one
    line of JSON output. Falsy input (``None``, ``""``) yields ``""``.
    """
    if not text:
        return ""
    # Replace each structural whitespace character with a plain space,
    # then squeeze repeated whitespace with a split/join pass.
    for ch in ('\n', '\r', '\t'):
        text = text.replace(ch, ' ')
    return ' '.join(text.split())
|
|
|
|
def _content_text(content):
    """Flatten a message ``content`` field to plain text.

    The field is either a plain string or a list of block dicts; only
    blocks with ``type == 'text'`` contribute. Anything else yields "".
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Guard with isinstance: a malformed (non-dict) block should be
        # skipped, not crash extraction for the whole file.
        return ''.join(
            item.get('text', '')
            for item in content
            if isinstance(item, dict) and item.get('type') == 'text'
        )
    return ''


def extract_pairs_from_session(file_path):
    """Extract user-assistant pairs from a session file.

    Reads a JSONL session transcript and returns a list of dicts with
    ``user``, ``assistant``, ``agent_id``, ``session`` and ``timestamp``
    keys — one per user message immediately followed by an assistant
    message. Unparseable lines and malformed messages are skipped;
    unexpected errors are reported to stderr and whatever pairs were
    collected so far are returned.
    """
    pairs = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Keep only records tagged as messages; ignore lines that are
        # not valid JSON.
        messages = []
        for line in lines:
            try:
                data = json.loads(line.strip())
                if data.get('type') == 'message':
                    messages.append(data)
            except json.JSONDecodeError:
                continue

        # Extract user-assistant pairs: a user message immediately
        # followed by an assistant message. The range already stops at
        # len - 1, so messages[i + 1] is always valid (the original
        # re-checked i + 1 < len(messages) redundantly). Using .get for
        # the nested keys means one malformed record skips only that
        # pair instead of aborting the rest of the file via the outer
        # except.
        for i in range(len(messages) - 1):
            msg = messages[i].get('message') or {}
            nxt = messages[i + 1].get('message') or {}
            if msg.get('role') != 'user' or nxt.get('role') != 'assistant':
                continue

            # Timestamp of the assistant turn; used by the caller for
            # chronological sorting. Skip pairs without one rather than
            # raising (and rather than mixing types in the later sort).
            timestamp = messages[i + 1].get('timestamp')
            if timestamp is None:
                continue

            user_text = clean_text(_content_text(msg.get('content')))
            assistant_text = clean_text(_content_text(nxt.get('content')))

            if user_text and assistant_text:
                pairs.append({
                    'user': user_text,
                    'assistant': assistant_text,
                    'agent_id': 'case',
                    'session': 'main',
                    'timestamp': timestamp,
                })
    except Exception as e:
        # Boundary handler for a best-effort batch script: report the
        # file that failed and return any pairs gathered before the
        # error.
        print(f"Error processing {file_path}: {e}", file=sys.stderr)

    return pairs
|
|
|
|
def main():
    """Collect recent session transcripts and emit user/assistant pairs.

    Scans the agent's session directory for the 10 most recently
    modified ``*.jsonl`` files, extracts user→assistant message pairs
    from each, and prints the 10 most recent pairs (by timestamp) as
    one JSON object per line on stdout. Diagnostics go to stderr so
    stdout stays machine-readable.
    """
    sessions_dir = "/home/wdjones/.openclaw/agents/main/sessions/"

    # Find session files in pure Python (glob + mtime sort) instead of
    # shelling out to `find ... | sort | head` with shell=True: this
    # avoids shell quoting/injection hazards and the fragile text
    # parsing of the pipeline's stdout. '**' with recursive=True also
    # matches files directly in sessions_dir.
    pattern = os.path.join(sessions_dir, '**', '*.jsonl')
    session_files = [p for p in glob.glob(pattern, recursive=True)
                     if os.path.isfile(p)]
    # Most recently modified first, mirroring `sort -rn | head -10`.
    session_files.sort(key=os.path.getmtime, reverse=True)

    all_pairs = []
    for file_path in session_files[:10]:
        all_pairs.extend(extract_pairs_from_session(file_path))

    # Sort by timestamp and keep only the 10 most recent pairs.
    all_pairs.sort(key=lambda x: x['timestamp'])
    last_10_pairs = all_pairs[-10:]

    print(f"Extracted {len(last_10_pairs)} assistant turns from recent sessions", file=sys.stderr)

    # Emit one JSON object per pair; the timestamp was only needed for
    # ordering, so drop it from the final output.
    for pair in last_10_pairs:
        output_pair = {k: v for k, v in pair.items() if k != 'timestamp'}
        print(json.dumps(output_pair, ensure_ascii=True))


if __name__ == "__main__":
    main()