# salesforce-appraiser-review.../scripts/reconstruct_and_inspect.py
#!/usr/bin/env python3
"""Reconstruct a .docx from base64 chunks embedded in a chat-session log,
unzip it, and inspect word/document.xml for deficiency-related table text.

Exit codes: 2 = no base64 chunks found, 3 = base64 decode failure,
4 = unzip failure, 5 = word/document.xml missing, 6 = XML parse failure.
"""
import base64
import os
import re
import sys
import xml.etree.ElementTree as ET
import zipfile

# Chat log carrying the base64 payload, and the root for all artifacts.
LOG_PATH = '/home/paulh/.vscode-server/data/User/workspaceStorage/79b924110cb5ff6de49811d445e59969-1/GitHub.copilot-chat/chat-session-resources/90e6dae0-2184-412b-af0b-eac258be98c5/call_gY8uUyzuGvFiN46d4ZPVFtjz__vscode-1775271381281/content.txt'
OUT_DIR = 'artifacts/doc_inspect'

# WordprocessingML namespace used by word/document.xml.
WORD_NS = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

# Case-insensitive markers of deficiency-related table rows.
ROW_KEYWORDS = ('deficiency', 'description', 'defect', 'ac-', 'deficiencylist')

# Fallback recognition of raw payload lines: a stripped line longer than this,
# made up purely of base64 characters, is accepted even without the
# 'BASE64_CHUNK:' prefix.
_MIN_BARE_LEN = 100
_BARE_B64_RE = re.compile(r'[A-Za-z0-9+/=]+')


def extract_base64_chunks(lines):
    """Collect base64 text chunks between BASE64_BEGIN and BASE64_END markers.

    Lines carrying a 'BASE64_CHUNK:' prefix contribute the text after the
    prefix; otherwise long, pure-base64 lines are taken as-is (fallback).
    Returns a list of chunk strings (possibly empty).
    """
    chunks = []
    started = False
    for line in lines:
        if 'BASE64_BEGIN' in line:
            started = True
            continue
        if 'BASE64_END' in line:
            break
        if not started:
            continue
        if 'BASE64_CHUNK:' in line:
            chunks.append(line.split('BASE64_CHUNK:', 1)[1].strip())
        else:
            s = line.strip()
            if len(s) > _MIN_BARE_LEN and _BARE_B64_RE.fullmatch(s):
                chunks.append(s)
    return chunks


def decode_chunks(chunks):
    """Join chunks, strip any whitespace that snuck in, and base64-decode.

    Raises binascii.Error (a ValueError) on malformed base64.
    """
    return base64.b64decode(re.sub(r'\s+', '', ''.join(chunks)))


def row_texts(tr):
    """Return the ' | '-joined text of all non-empty w:t runs in a table row."""
    texts = [t.text for t in tr.findall('.//w:t', WORD_NS) if t.text]
    return ' | '.join(texts).strip()


def main():
    os.makedirs(OUT_DIR, exist_ok=True)

    with open(LOG_PATH, 'r', errors='replace') as f:
        chunks = extract_base64_chunks(f)
    if not chunks:
        print('ERROR: no base64 chunks found in log at', LOG_PATH)
        sys.exit(2)
    try:
        data = decode_chunks(chunks)
    except Exception as e:
        print('ERROR decoding base64:', e)
        sys.exit(3)

    docx_path = os.path.join(OUT_DIR, 'downloaded.docx')
    with open(docx_path, 'wb') as f:
        f.write(data)
    print('WROTE_DOCX:', docx_path, 'size=', os.path.getsize(docx_path))

    # A .docx is a zip archive; extract it so document.xml is reachable.
    unzip_dir = os.path.join(OUT_DIR, 'unzipped')
    os.makedirs(unzip_dir, exist_ok=True)
    try:
        with zipfile.ZipFile(docx_path, 'r') as z:
            z.extractall(unzip_dir)
    except Exception as e:
        print('ERROR unzipping docx:', e)
        sys.exit(4)

    doc_xml = os.path.join(unzip_dir, 'word', 'document.xml')
    if not os.path.exists(doc_xml):
        print('ERROR: word/document.xml not found in the docx')
        sys.exit(5)
    try:
        root = ET.parse(doc_xml).getroot()
    except Exception as e:
        print('ERROR parsing document.xml:', e)
        sys.exit(6)

    tables = root.findall('.//w:tbl', WORD_NS)
    print('TABLE_COUNT:', len(tables))

    # Dump row texts (first 5 tables, 20 rows each) and collect rows whose
    # text matches any deficiency keyword.
    found_def_texts = []
    for ti, tbl in enumerate(tables[:5], start=1):
        rows = tbl.findall('.//w:tr', WORD_NS)
        print('\n--- TABLE', ti, 'rows=', len(rows), '---')
        for ri, tr in enumerate(rows[:20], start=1):
            joined = row_texts(tr)
            if joined:
                print('ROW %d:' % ri, repr(joined))
                if any(k in joined.lower() for k in ROW_KEYWORDS):
                    found_def_texts.append(joined)
            else:
                print('ROW %d: <empty>' % ri)

    # Raw keyword counts over the whole XML (case-sensitive, on purpose:
    # distinguishes e.g. 'DeficiencyList' element names from prose).
    with open(doc_xml, 'r', encoding='utf-8', errors='replace') as f:
        full_xml = f.read()
    print('\nKEYWORD_COUNTS:')
    for k in ('Deficiency', 'DeficiencyList', 'Description', '<TableRow', 'AC-'):
        print(k + ':', full_xml.count(k))

    print('\nFOUND_DEFICIENCY_TEXTS_COUNT:', len(found_def_texts))
    for i, txt in enumerate(found_def_texts[:20], start=1):
        print('FOUND_%d:' % i, txt)
    print('\nSUMMARY: docx_size=%d tables=%d deficiency_text_found=%s'
          % (os.path.getsize(docx_path), len(tables), bool(found_def_texts)))
    print('OUTPUT_DIR:', os.path.abspath(OUT_DIR))


if __name__ == '__main__':
    main()