# salesforce-appraiser-review.../scripts/reconstruct_and_inspect.py
#!/usr/bin/env python3
"""Reconstruct a .docx from base64 chunks embedded in a chat-session log,
unzip it, and inspect word/document.xml for deficiency-related table text.

Exit codes: 2 = no base64 chunks found, 3 = base64 decode failure,
4 = unzip failure, 5 = word/document.xml missing, 6 = XML parse failure.
"""
import base64
import os
import re
import sys
import xml.etree.ElementTree as ET
import zipfile

# Chat log carrying the base64 payload, and the root for all artifacts.
LOG_PATH = '/home/paulh/.vscode-server/data/User/workspaceStorage/79b924110cb5ff6de49811d445e59969-1/GitHub.copilot-chat/chat-session-resources/90e6dae0-2184-412b-af0b-eac258be98c5/call_gY8uUyzuGvFiN46d4ZPVFtjz__vscode-1775271381281/content.txt'
OUT_DIR = 'artifacts/doc_inspect'

# WordprocessingML namespace used by word/document.xml.
WORD_NS = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

# Case-insensitive markers of deficiency-related table rows.
ROW_KEYWORDS = ('deficiency', 'description', 'defect', 'ac-', 'deficiencylist')

# Fallback recognition of raw payload lines: a stripped line longer than this,
# made up purely of base64 characters, is accepted even without the
# 'BASE64_CHUNK:' prefix.
_MIN_BARE_LEN = 100
_BARE_B64_RE = re.compile(r'[A-Za-z0-9+/=]+')


def extract_base64_chunks(lines):
    """Collect base64 text chunks between BASE64_BEGIN and BASE64_END markers.

    Lines carrying a 'BASE64_CHUNK:' prefix contribute the text after the
    prefix; otherwise long, pure-base64 lines are taken as-is (fallback).
    Returns a list of chunk strings (possibly empty).
    """
    chunks = []
    started = False
    for line in lines:
        if 'BASE64_BEGIN' in line:
            started = True
            continue
        if 'BASE64_END' in line:
            break
        if not started:
            continue
        if 'BASE64_CHUNK:' in line:
            chunks.append(line.split('BASE64_CHUNK:', 1)[1].strip())
        else:
            s = line.strip()
            if len(s) > _MIN_BARE_LEN and _BARE_B64_RE.fullmatch(s):
                chunks.append(s)
    return chunks


def decode_chunks(chunks):
    """Join chunks, strip any whitespace that snuck in, and base64-decode.

    Raises binascii.Error (a ValueError) on malformed base64.
    """
    return base64.b64decode(re.sub(r'\s+', '', ''.join(chunks)))


def row_texts(tr):
    """Return the ' | '-joined text of all non-empty w:t runs in a table row."""
    texts = [t.text for t in tr.findall('.//w:t', WORD_NS) if t.text]
    return ' | '.join(texts).strip()


def main():
    os.makedirs(OUT_DIR, exist_ok=True)

    with open(LOG_PATH, 'r', errors='replace') as f:
        chunks = extract_base64_chunks(f)
    if not chunks:
        print('ERROR: no base64 chunks found in log at', LOG_PATH)
        sys.exit(2)
    try:
        data = decode_chunks(chunks)
    except Exception as e:
        print('ERROR decoding base64:', e)
        sys.exit(3)

    docx_path = os.path.join(OUT_DIR, 'downloaded.docx')
    with open(docx_path, 'wb') as f:
        f.write(data)
    print('WROTE_DOCX:', docx_path, 'size=', os.path.getsize(docx_path))

    # A .docx is a zip archive; extract it so document.xml is reachable.
    unzip_dir = os.path.join(OUT_DIR, 'unzipped')
    os.makedirs(unzip_dir, exist_ok=True)
    try:
        with zipfile.ZipFile(docx_path, 'r') as z:
            z.extractall(unzip_dir)
    except Exception as e:
        print('ERROR unzipping docx:', e)
        sys.exit(4)

    doc_xml = os.path.join(unzip_dir, 'word', 'document.xml')
    if not os.path.exists(doc_xml):
        print('ERROR: word/document.xml not found in the docx')
        sys.exit(5)
    try:
        root = ET.parse(doc_xml).getroot()
    except Exception as e:
        print('ERROR parsing document.xml:', e)
        sys.exit(6)

    tables = root.findall('.//w:tbl', WORD_NS)
    print('TABLE_COUNT:', len(tables))

    # Dump row texts (first 5 tables, 20 rows each) and collect rows whose
    # text matches any deficiency keyword.
    found_def_texts = []
    for ti, tbl in enumerate(tables[:5], start=1):
        rows = tbl.findall('.//w:tr', WORD_NS)
        print('\n--- TABLE', ti, 'rows=', len(rows), '---')
        for ri, tr in enumerate(rows[:20], start=1):
            joined = row_texts(tr)
            if joined:
                print('ROW %d:' % ri, repr(joined))
                if any(k in joined.lower() for k in ROW_KEYWORDS):
                    found_def_texts.append(joined)
            else:
                print('ROW %d: <empty>' % ri)

    # Raw keyword counts over the whole XML (case-sensitive, on purpose:
    # distinguishes e.g. 'DeficiencyList' element names from prose).
    with open(doc_xml, 'r', encoding='utf-8', errors='replace') as f:
        full_xml = f.read()
    print('\nKEYWORD_COUNTS:')
    for k in ('Deficiency', 'DeficiencyList', 'Description', '<TableRow', 'AC-'):
        print(k + ':', full_xml.count(k))

    print('\nFOUND_DEFICIENCY_TEXTS_COUNT:', len(found_def_texts))
    for i, txt in enumerate(found_def_texts[:20], start=1):
        print('FOUND_%d:' % i, txt)
    print('\nSUMMARY: docx_size=%d tables=%d deficiency_text_found=%s'
          % (os.path.getsize(docx_path), len(tables), bool(found_def_texts)))
    print('OUTPUT_DIR:', os.path.abspath(OUT_DIR))


if __name__ == '__main__':
    main()