#!/usr/bin/env python3
import os, re, base64, zipfile, sys
import xml.etree.ElementTree as ET

log_path = '/home/paulh/.vscode-server/data/User/workspaceStorage/79b924110cb5ff6de49811d445e59969-1/GitHub.copilot-chat/chat-session-resources/90e6dae0-2184-412b-af0b-eac258be98c5/call_gY8uUyzuGvFiN46d4ZPVFtjz__vscode-1775271381281/content.txt'
out_dir = 'artifacts/doc_inspect'
os.makedirs(out_dir, exist_ok=True)

# Collect base64 chunks between the BASE64_BEGIN and BASE64_END markers in the log
start = False
chunks = []
with open(log_path, 'r', errors='replace') as f:
    for line in f:
        if 'BASE64_BEGIN' in line:
            start = True
            continue
        if 'BASE64_END' in line:
            break
        if start:
            if 'BASE64_CHUNK:' in line:
                parts = line.split('BASE64_CHUNK:', 1)[1].strip()
                chunks.append(parts)
            else:
                # fallback: if the line looks like base64 (long and only base64 chars + '='), take it
                s = line.strip()
                if len(s) > 100 and re.fullmatch(r'[A-Za-z0-9+/=]+', s):
                    chunks.append(s)

if not chunks:
    print('ERROR: no base64 chunks found in log at', log_path)
    sys.exit(2)

b64 = ''.join(chunks)
# sanitize: strip any stray whitespace that snuck into the chunks
b64 = re.sub(r'\s+', '', b64)
try:
    data = base64.b64decode(b64)
except Exception as e:
    print('ERROR decoding base64:', e)
    sys.exit(3)

docx_path = os.path.join(out_dir, 'downloaded.docx')
with open(docx_path, 'wb') as f:
    f.write(data)
print('WROTE_DOCX:', docx_path, 'size=', os.path.getsize(docx_path))

# unzip the .docx (it is a ZIP archive) so we can read its WordprocessingML parts
unzip_dir = os.path.join(out_dir, 'unzipped')
os.makedirs(unzip_dir, exist_ok=True)
try:
    with zipfile.ZipFile(docx_path, 'r') as z:
        z.extractall(unzip_dir)
except Exception as e:
    print('ERROR unzipping docx:', e)
    sys.exit(4)

doc_xml = os.path.join(unzip_dir, 'word', 'document.xml')
if not os.path.exists(doc_xml):
    print('ERROR: word/document.xml not found in the docx')
    sys.exit(5)

# parse the XML and extract tables
ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
ET.register_namespace('w', ns['w'])
try:
    tree = ET.parse(doc_xml)
    root = tree.getroot()
except Exception as e:
    print('ERROR parsing document.xml:', e)
    sys.exit(6)

tables = root.findall('.//w:tbl', ns)
print('TABLE_COUNT:', len(tables))

# For each table, collect row texts (limit output to the first 5 tables and 20 rows each)
found_def_texts = []
for ti, tbl in enumerate(tables[:5], start=1):
    rows = tbl.findall('.//w:tr', ns)
    print('\n--- TABLE', ti, 'rows=', len(rows), '---')
    for ri, tr in enumerate(rows[:20], start=1):
        texts = [t.text for t in tr.findall('.//w:t', ns) if t.text]
        joined = ' | '.join(texts).strip()
        if joined:
            print('ROW %d:' % ri, repr(joined))
            # heuristic: flag rows containing deficiency-related keywords
            # (comparison is case-insensitive, so 'ac-' also covers 'AC-')
            if any(k.lower() in joined.lower() for k in ('deficiency', 'description', 'defect', 'ac-', 'DeficiencyList')):
                found_def_texts.append(joined)
        else:
            print('ROW %d:' % ri, '<empty>')

# Also search the whole document.xml for certain keywords
full_xml = open(doc_xml, 'r', encoding='utf-8', errors='replace').read()
keywords = ['Deficiency', 'DeficiencyList', 'Description', '