import type { CreateRecipeInput } from '../types/recipe.js';

/**
 * One recipe as read from a CopyMeThat HTML export, before it is mapped to
 * the application's own input shape.
 */
export interface ParsedCopyMeThatRecipe {
  /** Recipe name (required; blocks without one are rejected). */
  title: string;
  /** `href` of the "original link" anchor, when present. */
  sourceUrl?: string;
  description?: string;
  /** Raw `src` of the first image in the recipe block (may be relative). */
  imageUrl?: string;
  /** Category labels; empty when the recipe has none. */
  tags: string[];
  /** Whether the export marks the recipe as "made". */
  made: boolean;
  /** Integer rating in the range 1-5; omitted when absent or out of range. */
  rating?: number;
  /** Free-form yield text exactly as exported, e.g. "6 servings". */
  servings?: string;
  ingredients: string[];
  instructions: string[];
  /** Individual notes joined by a blank line. */
  notes?: string;
}

/** Named character references decoded by `cleanText`. `nbsp` maps to a plain space. */
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['amp', '&'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

/** Escapes a literal so it can be embedded in a `RegExp` source string. */
function escapeRegExp(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/** Regex source matching ` id="<id>"`, tolerating spaces around `=` and either quote style. */
function idAttr(id: string): string {
  return `\\sid\\s*=\\s*["']${escapeRegExp(id)}["']`;
}

/**
 * Regex source matching a `class` attribute whose whitespace-separated token
 * list contains `className` (so `class="recipe"` does not match `recipeNote`).
 */
function classAttr(className: string): string {
  return `\\sclass\\s*=\\s*["'](?:[^"']*\\s)?${escapeRegExp(className)}(?:\\s[^"']*)?["']`;
}

/** Regex source for an opening tag named `tagName` that carries the attribute pattern `attr`. */
function openingTag(tagName: string, attr: string): string {
  return `<${tagName}\\b[^>]*?${attr}[^>]*>`;
}

/** Regex source for "any element name". */
const ANY_TAG = '[a-z][a-z0-9]*';

/**
 * Replacement callback for a single character reference.
 * Unknown names and invalid code points are returned unchanged.
 */
function decodeEntity(entity: string, body: string): string {
  if (body.startsWith('#')) {
    const isHex = body[1] === 'x' || body[1] === 'X';
    const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
    const isValid =
      codePoint > 0 &&
      codePoint <= 0x10ffff &&
      !(codePoint >= 0xd800 && codePoint <= 0xdfff); // lone surrogates are not characters
    return isValid ? String.fromCodePoint(codePoint) : entity;
  }
  return NAMED_ENTITIES.get(body) ?? entity;
}

/**
 * Parses CopyMeThat HTML export format.
 * Supports both single-recipe HTML and multi-recipe exports.
 *
 * Parsing is regex based (no DOM). NOTE(review): the element ids/classes used
 * below (`recipe`, `recipeCategory`, `ratingValue`, `recipeYield`,
 * `recipeNotes`, `recipeNote`, ...) follow the CopyMeThat export layout as
 * understood at the time of writing - confirm against a current export.
 */
export class CopyMeThatHtmlParser {
  /**
   * Parse all recipes from a CopyMeThat HTML export file.
   *
   * @param html Full text of the export document.
   * @returns Every recipe block that has a title, at least one ingredient and
   *          at least one instruction; invalid blocks are skipped (and logged).
   */
  parseRecipes(html: string): ParsedCopyMeThatRecipe[] {
    const recipeBlocks = this.extractRecipeBlocks(html);
    console.log(`[CopyMeThatHtmlParser] Found ${recipeBlocks.length} recipe blocks`);

    const parsed = recipeBlocks
      .map(block => this.parseRecipeBlock(block))
      .filter((recipe): recipe is ParsedCopyMeThatRecipe => recipe !== null);

    console.log(`[CopyMeThatHtmlParser] Successfully parsed ${parsed.length} recipes`);
    return parsed;
  }

  /**
   * Extract individual recipe HTML blocks from the document.
   *
   * A block starts at a `<div class="recipe">` opening tag and runs up to the
   * next such tag, or to the end of the document for the last one.
   */
  private extractRecipeBlocks(html: string): string[] {
    const recipeOpen = new RegExp(openingTag('div', classAttr('recipe')), 'gi');
    const starts: number[] = [];

    let match: RegExpExecArray | null;
    while ((match = recipeOpen.exec(html)) !== null) {
      starts.push(match.index);
    }

    // slice(start, undefined) runs to the end of the string for the last block.
    return starts.map((start, i) => html.slice(start, starts[i + 1]));
  }

  /**
   * Parse a single recipe HTML block.
   *
   * @returns The parsed recipe, or `null` when the block lacks a title,
   *          ingredients or instructions, or when parsing throws.
   */
  private parseRecipeBlock(html: string): ParsedCopyMeThatRecipe | null {
    try {
      const title = this.extractById(html, 'name');
      const sourceUrl = this.extractLinkById(html, 'original_link');
      const description = this.extractById(html, 'description');
      const imageUrl = this.extractAttr(
        html,
        /<img\b[^>]*?\ssrc\s*=\s*(?:"([^"]*)"|'([^']*)')/i,
      );
      const tags = this.extractTags(html);
      const made =
        /\sid\s*=\s*["']made_this["']/i.test(html) || html.includes('I made this');
      const rating = this.extractRating(html);
      const servings = this.extractText(
        html,
        new RegExp(`${openingTag('a', idAttr('recipeYield'))}([^<]+)<\\/a>`, 'i'),
      );
      // Both lists and the notes come back already cleaned; cleaning them a
      // second time would double-decode entities and flatten note paragraphs.
      const ingredients = this.extractListItems(html, 'recipeIngredient');
      const instructions = this.extractListItems(html, 'instruction');
      const notes = this.extractNotes(html);

      if (!title || ingredients.length === 0 || instructions.length === 0) {
        console.log(`[Parser] Rejected recipe - title: ${!!title}, ingredients: ${ingredients.length}, instructions: ${instructions.length}`);
        return null; // Invalid recipe
      }

      return {
        title: this.cleanText(title),
        sourceUrl: sourceUrl || undefined,
        description: description ? this.cleanText(description) : undefined,
        imageUrl: imageUrl || undefined,
        tags,
        made,
        rating,
        servings: servings ? this.cleanText(servings) : undefined,
        ingredients,
        instructions,
        notes: notes || undefined,
      };
    } catch (error) {
      console.error('Error parsing recipe block:', error);
      return null;
    }
  }

  /**
   * Extract text content from a regex match.
   *
   * @returns The first capture group that participated in the match, or
   *          `null` when the regex does not match.
   */
  private extractText(html: string, regex: RegExp): string | null {
    const match = regex.exec(html);
    if (!match) return null;
    for (let i = 1; i < match.length; i++) {
      const group = match[i];
      if (group !== undefined) return group;
    }
    return null;
  }

  /**
   * Extract attribute value from a regex match.
   * Same contract as `extractText`; kept as a separate name for readability
   * at call sites.
   */
  private extractAttr(html: string, regex: RegExp): string | null {
    return this.extractText(html, regex);
  }

  /**
   * Extract text from an element by id (handles spaces around =).
   * Only captures immediate text content, not nested elements.
   *
   * @returns The trimmed raw text (entities not yet decoded), or `null` when
   *          the element is missing or has no leading text.
   */
  private extractById(html: string, id: string): string | null {
    const regex = new RegExp(`${openingTag(ANY_TAG, idAttr(id))}\\s*([^<]+)`, 'i');
    const text = regex.exec(html)?.[1]?.trim();
    return text ? text : null;
  }

  /**
   * Extract href from a link by id (handles spaces around =).
   * The `id` and `href` attributes may appear in either order.
   */
  private extractLinkById(html: string, id: string): string | null {
    const anchor = new RegExp(openingTag('a', idAttr(id)), 'i').exec(html);
    if (!anchor) return null;
    return this.extractAttr(anchor[0], /\shref\s*=\s*(?:"([^"]*)"|'([^']*)')/i);
  }

  /**
   * Extract tags from categories section.
   *
   * @returns Cleaned text of every `<span class="recipeCategory">`, in order.
   */
  private extractTags(html: string): string[] {
    const tags: string[] = [];
    const tagRegex = new RegExp(
      `${openingTag('span', classAttr('recipeCategory'))}([^<]+)<\\/span>`,
      'gi',
    );

    let match: RegExpExecArray | null;
    while ((match = tagRegex.exec(html)) !== null) {
      const tag = this.cleanText(match[1] ?? '');
      if (tag) tags.push(tag);
    }
    return tags;
  }

  /**
   * Extract rating value (1-5).
   *
   * @returns The integer rating, or `undefined` when missing or outside 1-5.
   */
  private extractRating(html: string): number | undefined {
    const ratingRegex = new RegExp(
      `${openingTag('span', idAttr('ratingValue'))}\\s*(\\d+)\\s*<\\/span>`,
      'i',
    );
    const digits = ratingRegex.exec(html)?.[1];
    if (digits === undefined) return undefined;

    const rating = parseInt(digits, 10);
    return rating >= 1 && rating <= 5 ? rating : undefined;
  }

  /**
   * Extract list items (ingredients or instructions).
   *
   * @param className Class carried by the `<li>` elements to collect.
   * @returns Cleaned, non-empty item texts in document order.
   */
  private extractListItems(html: string, className: string): string[] {
    const items: string[] = [];
    const itemRegex = new RegExp(
      `${openingTag('li', classAttr(className))}([\\s\\S]*?)<\\/li>`,
      'gi',
    );

    let match: RegExpExecArray | null;
    while ((match = itemRegex.exec(html)) !== null) {
      const text = this.cleanText(match[1] ?? '');
      if (text) items.push(text);
    }
    return items;
  }

  /**
   * Extract recipe notes.
   *
   * Looks for the element with id `recipeNotes` and collects every
   * `recipeNote` element that follows it inside the block. The container's
   * closing tag is deliberately not searched for: a lazy match would stop at
   * the first nested closing tag and cut the notes off.
   *
   * @returns Cleaned notes joined by a blank line, or `null` when there is no
   *          notes container or it holds no non-empty note.
   */
  private extractNotes(html: string): string | null {
    const container = new RegExp(openingTag(ANY_TAG, idAttr('recipeNotes')), 'i').exec(html);
    if (!container) return null;

    const notesHtml = html.slice(container.index + container[0].length);
    const noteTexts: string[] = [];
    // Group 1 is the element name (back-referenced by the closing tag), group 2 the body.
    const noteRegex = new RegExp(
      `<(div|li|p)\\b[^>]*?${classAttr('recipeNote')}[^>]*>([\\s\\S]*?)<\\/\\1>`,
      'gi',
    );

    let match: RegExpExecArray | null;
    while ((match = noteRegex.exec(notesHtml)) !== null) {
      const note = this.cleanText(match[2] ?? '');
      if (note) noteTexts.push(note);
    }

    return noteTexts.length > 0 ? noteTexts.join('\n\n') : null;
  }

  /**
   * Clean HTML entities and extra whitespace from text.
   *
   * Tags are removed first and entities decoded afterwards in a single pass,
   * so text such as `&lt;b&gt;` survives as `<b>` and `&amp;lt;` becomes
   * `&lt;` rather than `<`. Must be applied at most once per string.
   */
  private cleanText(text: string): string {
    return text
      .replace(/<br\s*\/?>/gi, ' ') // Line breaks separate words
      .replace(/<[^>]+>/g, '') // Remove remaining HTML tags
      .replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, decodeEntity)
      .replace(/\s+/g, ' ') // Normalize whitespace
      .trim();
  }

  /**
   * Convert parsed recipe to CreateRecipeInput format.
   *
   * Image URLs starting with `images/` are made root-relative (`/images/...`);
   * anything else that is not an http(s) URL is dropped.
   *
   * NOTE(review): `parsed.tags` is not forwarded - confirm whether
   * CreateRecipeInput is meant to carry tags.
   */
  toCreateRecipeInput(parsed: ParsedCopyMeThatRecipe): CreateRecipeInput {
    let imageUrl = parsed.imageUrl;
    if (imageUrl) {
      if (imageUrl.startsWith('images/')) {
        imageUrl = '/' + imageUrl;
      } else if (!/^https?:\/\//i.test(imageUrl)) {
        // Other relative paths or invalid URLs - discard
        imageUrl = undefined;
      }
    }

    return {
      title: parsed.title,
      description: parsed.description,
      source_url: parsed.sourceUrl,
      image_url: imageUrl || undefined,
      made: parsed.made,
      rating: parsed.rating,
      notes: parsed.notes,
      servings: parsed.servings ? this.extractServingCount(parsed.servings) : undefined,
      ingredients: parsed.ingredients.map((item, index) => ({
        item,
        position: index,
      })),
      steps: parsed.instructions.map((instruction, index) => ({
        instruction,
        position: index,
      })),
    };
  }

  /**
   * Try to extract numeric serving count from serving string.
   *
   * Recognises "6 servings" / "1 serving", "Serves 4", and a bare number.
   *
   * @returns The count, or `undefined` when none of those forms is present.
   */
  private extractServingCount(servingStr: string): number | undefined {
    const patterns = [/(\d+)\s*servings?/i, /\bserves\s*(\d+)/i, /^\s*(\d+)\s*$/];
    for (const pattern of patterns) {
      const digits = pattern.exec(servingStr)?.[1];
      if (digits !== undefined) return parseInt(digits, 10);
    }
    return undefined;
  }
}