# 101 lines, 3.7 KiB, Python (viewer metadata captured with the paste; not part of the script)
#!/usr/bin/env python3
"""Recover a base64-encoded .docx from a Copilot chat-session log and inspect it.

Pipeline:
  1. Scan the log for a BASE64_BEGIN .. BASE64_END section and collect chunks.
  2. Decode the payload and write it as ``downloaded.docx`` under ``out_dir``.
  3. Unzip the .docx and parse ``word/document.xml``.
  4. Print table contents and keyword counts to help locate a
     deficiency-list table in the document.

Exit codes: 2 no base64 chunks found, 3 invalid base64, 4 unzip failure,
5 word/document.xml missing, 6 XML parse failure.
"""

import base64
import os
import re
import sys
import xml.etree.ElementTree as ET
import zipfile

# Defaults preserved from the original one-off script; main() accepts
# overrides so the pipeline can be pointed at other logs / output dirs.
DEFAULT_LOG_PATH = '/home/paulh/.vscode-server/data/User/workspaceStorage/79b924110cb5ff6de49811d445e59969-1/GitHub.copilot-chat/chat-session-resources/90e6dae0-2184-412b-af0b-eac258be98c5/call_gY8uUyzuGvFiN46d4ZPVFtjz__vscode-1775271381281/content.txt'
DEFAULT_OUT_DIR = 'artifacts/doc_inspect'

# WordprocessingML namespace used throughout word/document.xml.
W_NS = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

# Fallback detector for a bare base64 line. Compiled once instead of
# re-matching the pattern on every log line.
_B64_LINE_RE = re.compile(r'[A-Za-z0-9+/=]+')


def _extract_chunks(fh):
    """Return the base64 chunk strings found in the open log file *fh*.

    Collection starts after a line containing BASE64_BEGIN and stops at the
    first line containing BASE64_END. Lines tagged ``BASE64_CHUNK:`` are
    preferred; otherwise a long line of pure base64 characters is accepted
    as a fallback.
    """
    chunks = []
    in_payload = False
    for line in fh:
        if 'BASE64_BEGIN' in line:
            in_payload = True
            continue
        if 'BASE64_END' in line:
            break
        if not in_payload:
            continue
        if 'BASE64_CHUNK:' in line:
            chunks.append(line.split('BASE64_CHUNK:', 1)[1].strip())
        else:
            # fallback: if line looks like base64 (long and only base64
            # chars + '='), take it
            s = line.strip()
            if len(s) > 100 and _B64_LINE_RE.fullmatch(s):
                chunks.append(s)
    return chunks


def _decode_payload(chunks):
    """Join *chunks*, strip stray whitespace, and return the decoded bytes.

    Exits with code 3 when the payload is not valid base64.
    """
    # sanitize (remove any whitespace/DEBUG artifacts that snuck in)
    b64 = re.sub(r'\s+', '', ''.join(chunks))
    try:
        return base64.b64decode(b64)
    except Exception as e:
        print('ERROR decoding base64:', e)
        sys.exit(3)


def _report_tables(root):
    """Print row texts of the document's tables and return the findings.

    Output is limited to the first 5 tables and 20 rows each. Returns a
    ``(tables, found_def_texts)`` pair where *found_def_texts* holds the
    joined row texts that match the deficiency-related keywords.
    """
    tables = root.findall('.//w:tbl', W_NS)
    print('TABLE_COUNT:', len(tables))
    found_def_texts = []
    for ti, tbl in enumerate(tables[:5], start=1):
        rows = tbl.findall('.//w:tr', W_NS)
        print('\n--- TABLE', ti, 'rows=', len(rows), '---')
        for ri, tr in enumerate(rows[:20], start=1):
            texts = [t.text for t in tr.findall('.//w:t', W_NS) if t.text]
            joined = ' | '.join(texts).strip()
            if joined:
                print('ROW %d:' % ri, repr(joined))
                # heuristic: case-insensitive keyword match
                if any(k.lower() in joined.lower() for k in
                       ('deficiency', 'description', 'defect', 'ac-', 'AC-', 'DeficiencyList')):
                    found_def_texts.append(joined)
            else:
                print('ROW %d: <empty>' % ri)
    return tables, found_def_texts


def main(log_path=DEFAULT_LOG_PATH, out_dir=DEFAULT_OUT_DIR):
    """Run the full extract / decode / unzip / inspect pipeline."""
    os.makedirs(out_dir, exist_ok=True)

    with open(log_path, 'r', errors='replace') as f:
        chunks = _extract_chunks(f)
    if not chunks:
        print('ERROR: no base64 chunks found in log at', log_path)
        sys.exit(2)

    data = _decode_payload(chunks)

    docx_path = os.path.join(out_dir, 'downloaded.docx')
    with open(docx_path, 'wb') as f:
        f.write(data)
    print('WROTE_DOCX:', docx_path, 'size=', os.path.getsize(docx_path))

    # unzip
    unzip_dir = os.path.join(out_dir, 'unzipped')
    os.makedirs(unzip_dir, exist_ok=True)
    try:
        with zipfile.ZipFile(docx_path, 'r') as z:
            z.extractall(unzip_dir)
    except Exception as e:
        print('ERROR unzipping docx:', e)
        sys.exit(4)

    doc_xml = os.path.join(unzip_dir, 'word', 'document.xml')
    if not os.path.exists(doc_xml):
        print('ERROR: word/document.xml not found in the docx')
        sys.exit(5)

    # parse XML and extract tables
    ET.register_namespace('w', W_NS['w'])
    try:
        root = ET.parse(doc_xml).getroot()
    except Exception as e:
        print('ERROR parsing document.xml:', e)
        sys.exit(6)

    tables, found_def_texts = _report_tables(root)

    # Also search the whole document.xml for certain keywords.
    # (Fix: the original leaked the file handle via open().read().)
    with open(doc_xml, 'r', encoding='utf-8', errors='replace') as f:
        full_xml = f.read()
    keywords = ['Deficiency', 'DeficiencyList', 'Description', '<TableRow', 'AC-']
    print('\nKEYWORD_COUNTS:')
    for k in keywords:
        print(k + ':', full_xml.count(k))

    print('\nFOUND_DEFICIENCY_TEXTS_COUNT:', len(found_def_texts))
    for i, txt in enumerate(found_def_texts[:20], start=1):
        print('FOUND_%d:' % i, txt)

    # Exit with success
    print('\nSUMMARY: docx_size=%d tables=%d deficiency_text_found=%s'
          % (os.path.getsize(docx_path), len(tables), bool(found_def_texts)))
    print('OUTPUT_DIR:', os.path.abspath(out_dir))


if __name__ == '__main__':
    main()