Full sync - all projects, memory, configs
This commit is contained in:
114
fixed_extract_assistant_turns.py
Normal file
114
fixed_extract_assistant_turns.py
Normal file
@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import json
|
||||
import os
|
||||
import glob
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
def clean_text(text):
    """Collapse *text* onto a single line with normalized whitespace.

    Args:
        text: The string to clean. Any falsy value (``None``, ``""``) is
            accepted and treated as "no text".

    Returns:
        ``""`` when *text* is falsy; otherwise *text* with leading and
        trailing whitespace removed and every run of whitespace (spaces,
        tabs, newlines, carriage returns) replaced by a single space.
    """
    if not text:
        return ""
    # str.split() with no separator already splits on every kind of
    # whitespace, so separate replace() passes for newlines, carriage
    # returns and tabs are not needed before re-joining with single spaces.
    return ' '.join(text.split())
|
||||
|
||||
def _message_role(entry):
    """Return the ``role`` of a session entry, or None if it has no ``message`` dict."""
    message = entry.get('message')
    if isinstance(message, dict):
        return message.get('role')
    return None


def _content_text(content):
    """Return the text carried by a message ``content`` value.

    A string is returned as-is. For a list, the ``text`` of every dict item
    whose ``type`` is ``'text'`` is concatenated in order. Anything else
    yields ``''``.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ''
    parts = []
    for item in content:
        if isinstance(item, dict) and item.get('type') == 'text':
            text = item.get('text', '')
            if isinstance(text, str):
                parts.append(text)
    return ''.join(parts)


def extract_pairs_from_session(file_path):
    """Extract user-assistant pairs from a JSON-lines session file.

    Each line of the file is parsed as JSON; lines that are not valid JSON,
    are not JSON objects, or whose ``type`` is not ``'message'`` are ignored.
    A pair is produced for every message with role ``'user'`` that is
    immediately followed (among the kept messages) by a message with role
    ``'assistant'``.

    Malformed records are skipped individually, so one bad record does not
    discard the remaining pairs of the file.

    Args:
        file_path: Path of the session file to read (opened as UTF-8).

    Returns:
        A list of dicts with the keys ``'user'``, ``'assistant'``,
        ``'agent_id'``, ``'session'`` and ``'timestamp'`` (taken from the
        assistant message). Pairs whose cleaned user or assistant text is
        empty, or whose assistant message has no timestamp, are omitted.
        If the file cannot be read, the error is reported on stderr and an
        empty list is returned.
    """
    messages = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    data = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue
                # A line may hold valid JSON that is not an object
                # (e.g. a list or a number); such lines carry no message.
                if isinstance(data, dict) and data.get('type') == 'message':
                    messages.append(data)
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeDecodeError raised while decoding the file.
        print(f"Error processing {file_path}: {e}", file=sys.stderr)
        return []

    pairs = []
    for prompt, reply in zip(messages, messages[1:]):
        if _message_role(prompt) != 'user' or _message_role(reply) != 'assistant':
            continue

        # Every returned pair must carry a timestamp, so a reply without
        # one is skipped instead of aborting the whole file.
        timestamp = reply.get('timestamp')
        if timestamp is None:
            continue

        user_text = clean_text(_content_text(prompt['message'].get('content')))
        assistant_text = clean_text(_content_text(reply['message'].get('content')))

        if user_text and assistant_text:
            pairs.append({
                'user': user_text,
                'assistant': assistant_text,
                'agent_id': 'case',
                'session': 'main',
                'timestamp': timestamp,
            })

    return pairs
|
||||
|
||||
def main(sessions_dir="/home/wdjones/.openclaw/agents/main/sessions/"):
    """Print the latest user-assistant pairs from the most recent sessions.

    The 10 most recently modified ``*.jsonl`` regular files below
    *sessions_dir* (searched recursively) are read with
    ``extract_pairs_from_session``. All resulting pairs are sorted by their
    ``'timestamp'`` and the last 10 are written to stdout, one JSON object
    per line, without the ``'timestamp'`` key. A summary line (or an error
    message when *sessions_dir* is not a directory) is written to stderr.

    Args:
        sessions_dir: Directory containing the session files. Defaults to
            the path this script was originally written for.

    Returns:
        None.
    """
    # The directory is walked in-process rather than through a shell
    # pipeline: a pipeline's exit status is that of its last command, so a
    # failing ``find`` would go unnoticed, and line-based parsing of its
    # output breaks on paths that contain newlines.
    if not os.path.isdir(sessions_dir):
        print("Error finding session files", file=sys.stderr)
        return

    candidates = []
    for root, _dirs, files in os.walk(sessions_dir):
        for name in files:
            if not name.endswith('.jsonl'):
                continue
            path = os.path.join(root, name)
            # Only regular files are considered; symbolic links are skipped.
            if os.path.islink(path) or not os.path.isfile(path):
                continue
            try:
                mtime = os.path.getmtime(path)
            except OSError:
                # The file disappeared or became unreadable since listing.
                continue
            candidates.append((mtime, path))

    # Newest files first; keep the 10 most recent.
    candidates.sort(key=lambda entry: entry[0], reverse=True)

    all_pairs = []
    for _mtime, file_path in candidates[:10]:
        all_pairs.extend(extract_pairs_from_session(file_path))

    # Sort by timestamp and take last 10
    all_pairs.sort(key=lambda x: x['timestamp'])
    last_10_pairs = all_pairs[-10:]

    print(f"Extracted {len(last_10_pairs)} assistant turns from recent sessions", file=sys.stderr)

    # Output JSON for each pair
    for pair in last_10_pairs:
        # The timestamp is only needed for ordering, not in the output.
        output_pair = {k: v for k, v in pair.items() if k != 'timestamp'}
        print(json.dumps(output_pair, ensure_ascii=True))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user