69 lines
2.5 KiB
Python
69 lines
2.5 KiB
Python
#!/usr/bin/env python3
|
|
import json
|
|
import sys
|
|
from collections import deque
|
|
|
|
def _extract_text(content):
    """Flatten a message ``content`` value into a plain string.

    A string is returned unchanged. A list is treated as a sequence of
    content items: the ``'text'`` values of every dict item whose ``'type'``
    is ``'text'`` are concatenated, and anything else in the list is ignored
    (so an empty list yields ``''``). ``None`` yields ``''``; any other value
    is passed through ``str()``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for item in content:
            # Non-dict items and non-text items (e.g. tool calls) carry no
            # text to extract; skip them instead of failing on ``.get``.
            if isinstance(item, dict) and item.get('type') == 'text':
                piece = item.get('text', '')
                if isinstance(piece, str):
                    parts.append(piece)
        return ''.join(parts)
    if content is None:
        return ''
    return str(content)


def extract_last_assistant_turns(file_path, num_turns=10):
    """Extract the last N assistant turns with their preceding user messages.

    The file is read as JSON Lines. Only records that are JSON objects with
    ``type == 'message'`` and a ``message`` object whose ``role`` is
    ``'user'`` or ``'assistant'`` are considered; lines that are not valid
    JSON or do not have that shape are skipped.

    Each assistant message is paired with the text of the most recent user
    message that precedes it in the file. If there is no preceding user
    message, or its extracted text is empty, the assistant message is
    dropped.

    Args:
        file_path: Path of the JSONL file to read (opened as UTF-8).
        num_turns: Maximum number of turns to return. Zero or a negative
            value yields an empty list.

    Returns:
        A list of at most ``num_turns`` dicts, in file order, each with the
        keys ``'user'``, ``'assistant'``, ``'agent_id'`` (always ``'case'``)
        and ``'session'`` (always ``'main'``).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # A bounded deque keeps only the newest ``num_turns`` entries; clamping
    # to zero avoids both ``deque``'s ValueError on a negative maxlen and the
    # ``turns[-0:]`` slice pitfall that would return every turn.
    turns = deque(maxlen=max(num_turns, 0))
    last_user_text = None

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line.strip())
            except json.JSONDecodeError:
                # Blank or malformed line: best-effort parsing, skip it.
                continue

            # Valid JSON that is not an object (list, number, ...) cannot be
            # a message record.
            if not isinstance(data, dict) or data.get('type') != 'message':
                continue

            message = data.get('message')
            if not isinstance(message, dict):
                continue

            role = message.get('role')
            if role == 'user':
                # Remember the latest user text so the next assistant
                # message can be paired with it without rescanning.
                last_user_text = _extract_text(message.get('content'))
            elif role == 'assistant' and last_user_text:
                turns.append({
                    'user': last_user_text,
                    'assistant': _extract_text(message.get('content')),
                    'agent_id': 'case',
                    'session': 'main',
                })

    return list(turns)
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: expects exactly one argument, the path of
    # the JSONL file, and prints each extracted turn as one JSON object per
    # line on stdout.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python3 extract_turns.py <jsonl_file>")
        sys.exit(1)

    jsonl_path = cli_args[0]
    for record in extract_last_assistant_turns(jsonl_path):
        print(json.dumps(record))