recipe-manager/src/backend/services/CopyMeThatHtmlParser.ts

249 lines
8.0 KiB
TypeScript

import type { CreateRecipeInput } from '../types/recipe.js';
/**
 * Intermediate, source-shaped representation of one recipe pulled out of a
 * CopyMeThat HTML export, before it is mapped to the application's own
 * recipe input type.
 */
export interface ParsedCopyMeThatRecipe {
  // --- Always present ---

  /** Recipe title. */
  title: string;
  /** Ingredient lines, in document order. */
  ingredients: string[];
  /** Instruction steps, in document order. */
  instructions: string[];
  /** Category/tag labels; empty when the recipe has none. */
  tags: string[];
  /** Whether the recipe is flagged as having been made. */
  made: boolean;

  // --- Present only when found in the export ---

  /** Link to the page the recipe was originally copied from. */
  sourceUrl?: string;
  /** Short free-text description. */
  description?: string;
  /** Image reference exactly as it appears in the export (may be a relative path). */
  imageUrl?: string;
  /** Numeric rating. */
  rating?: number;
  /** Yield/servings text as written in the export (not yet converted to a number). */
  servings?: string;
  /** Free-form notes attached to the recipe. */
  notes?: string;
}
/**
 * Parses the CopyMeThat HTML export format.
 *
 * Supports both single-recipe HTML and multi-recipe exports. Parsing is
 * regex-based (no DOM), so only the specific tag/attribute shapes matched
 * below are recognised: the identifying `class`/`id` attribute must be the
 * first attribute of its tag.
 */
export class CopyMeThatHtmlParser {
  /**
   * Parse all recipes from a CopyMeThat HTML export file.
   *
   * @param html - Full export document.
   * @returns One entry per recipe block that has a title, at least one
   *   ingredient and at least one instruction. Blocks that fail that check
   *   (or throw while parsing) are dropped.
   */
  parseRecipes(html: string): ParsedCopyMeThatRecipe[] {
    const recipeBlocks = this.extractRecipeBlocks(html);
    console.log(`[CopyMeThatHtmlParser] Found ${recipeBlocks.length} recipe blocks`);
    const parsed = recipeBlocks
      .map(block => this.parseRecipeBlock(block))
      // Type predicate instead of an `as` cast so the narrowing is checked.
      .filter((recipe): recipe is ParsedCopyMeThatRecipe => recipe !== null);
    console.log(`[CopyMeThatHtmlParser] Successfully parsed ${parsed.length} recipes`);
    return parsed;
  }

  /**
   * Extract individual recipe HTML blocks from the document.
   *
   * A block starts at `<div class="recipe" ...>` and runs up to (not
   * including) the next such div, `</body>`, or the end of the input.
   */
  private extractRecipeBlocks(html: string): string[] {
    const blocks: string[] = [];
    // Match with flexible whitespace around = and quotes
    const recipeRegex = /<div\s+class\s*=\s*["']recipe["'][^>]*>([\s\S]*?)(?=<div\s+class\s*=\s*["']recipe["']|<\/body>|$)/gi;
    let match: RegExpExecArray | null;
    while ((match = recipeRegex.exec(html)) !== null) {
      blocks.push(match[0]);
    }
    return blocks;
  }

  /**
   * Parse a single recipe HTML block.
   *
   * @returns The parsed recipe, or `null` when the block has no title, no
   *   ingredients or no instructions, or when parsing throws.
   */
  private parseRecipeBlock(html: string): ParsedCopyMeThatRecipe | null {
    try {
      const title = this.extractById(html, 'name');
      const sourceUrl = this.extractLinkById(html, 'original_link');
      const description = this.extractById(html, 'description');
      const imageUrl = this.extractAttr(html, /<img\s+class\s*=\s*["']recipeImage["'][^>]*src\s*=\s*["']([^"']+)["']/i);
      const tags = this.extractTags(html);
      // Same whitespace/quote tolerance as the other attribute matchers; the
      // literal-text check is kept as a fallback.
      const made = /id\s*=\s*["']made_this["']/i.test(html) || html.includes('I made this');
      const rating = this.extractRating(html);
      const servings = this.extractText(html, /<a\s+id\s*=\s*["']recipeYield["'][^>]*>([^<]+)<\/a>/i);
      const ingredients = this.extractListItems(html, 'recipeIngredient');
      const instructions = this.extractListItems(html, 'instruction');
      const notes = this.extractNotes(html);

      if (!title || ingredients.length === 0 || instructions.length === 0) {
        console.log(`[Parser] Rejected recipe - title: ${!!title}, ingredients: ${ingredients.length}, instructions: ${instructions.length}`);
        return null; // Invalid recipe
      }

      return {
        title: this.cleanText(title),
        sourceUrl: sourceUrl || undefined,
        description: description ? this.cleanText(description) : undefined,
        imageUrl: imageUrl || undefined,
        tags,
        made,
        rating,
        servings: servings ? this.cleanText(servings) : undefined,
        // ingredients, instructions and notes were already cleaned by their
        // extractors. Cleaning again would decode entities twice (turning an
        // escaped "&lt;x&gt;" into a tag that then gets stripped) and would
        // collapse the blank line that separates individual notes.
        ingredients,
        instructions,
        notes: notes || undefined,
      };
    } catch (error) {
      console.error('Error parsing recipe block:', error);
      return null;
    }
  }

  /**
   * Return capture group 1 of the first match of `regex` in `html`, or `null`.
   * Intended for non-global regexes (a global one would carry `lastIndex`
   * state between calls).
   */
  private firstCapture(html: string, regex: RegExp): string | null {
    const match = regex.exec(html);
    return match ? match[1] : null;
  }

  /**
   * Extract text content from a regex match (capture group 1).
   */
  private extractText(html: string, regex: RegExp): string | null {
    return this.firstCapture(html, regex);
  }

  /**
   * Extract attribute value from a regex match (capture group 1).
   */
  private extractAttr(html: string, regex: RegExp): string | null {
    return this.firstCapture(html, regex);
  }

  /**
   * Escape regex metacharacters so a value can be embedded in a `RegExp`
   * source string and matched literally.
   */
  private escapeRegExp(value: string): string {
    return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Extract text from a `<div>` by id (handles spaces around =).
   * Only captures the text immediately following the opening tag, up to the
   * next `<`; nested elements are not included.
   *
   * @returns The trimmed text, or `null` when no such div with leading text exists.
   */
  private extractById(html: string, id: string): string | null {
    const safeId = this.escapeRegExp(id);
    const regex = new RegExp(`<div\\s+id\\s*=\\s*["']${safeId}["'][^>]*>\\s*([^<]+)`, 'i');
    const match = regex.exec(html);
    return match ? match[1].trim() : null;
  }

  /**
   * Extract href from an `<a>` by id (handles spaces around =).
   */
  private extractLinkById(html: string, id: string): string | null {
    const safeId = this.escapeRegExp(id);
    const regex = new RegExp(`<a\\s+id\\s*=\\s*["']${safeId}["'][^>]*href\\s*=\\s*["']([^"']+)["']`, 'i');
    return this.extractAttr(html, regex);
  }

  /**
   * Extract tags from `<span class="recipeCategory">` elements.
   * Tags that are empty after cleaning are skipped.
   */
  private extractTags(html: string): string[] {
    const tags: string[] = [];
    const tagRegex = /<span\s+class\s*=\s*["']recipeCategory["'][^>]*>([^<]+)<\/span>/gi;
    let match: RegExpExecArray | null;
    while ((match = tagRegex.exec(html)) !== null) {
      const tag = this.cleanText(match[1]);
      if (tag) tags.push(tag);
    }
    return tags;
  }

  /**
   * Extract rating value (1-5).
   *
   * @returns The integer rating, or `undefined` when absent or outside 1-5.
   */
  private extractRating(html: string): number | undefined {
    const ratingMatch = /<span\s+id\s*=\s*["']ratingValue["'][^>]*>(\d+)<\/span>/i.exec(html);
    if (ratingMatch) {
      const rating = parseInt(ratingMatch[1], 10);
      return (rating >= 1 && rating <= 5) ? rating : undefined;
    }
    return undefined;
  }

  /**
   * Extract list items (ingredients or instructions).
   *
   * @param className - Class of the `<li>` elements to collect.
   * @returns Cleaned text of each matching `<li>`, skipping items that are
   *   empty after cleaning.
   */
  private extractListItems(html: string, className: string): string[] {
    const items: string[] = [];
    const safeClass = this.escapeRegExp(className);
    // More flexible regex to handle spaces around =
    const itemRegex = new RegExp(`<li\\s+class\\s*=\\s*["']${safeClass}["'][^>]*>([\\s\\S]*?)<\\/li>`, 'gi');
    let match: RegExpExecArray | null;
    while ((match = itemRegex.exec(html)) !== null) {
      const text = this.cleanText(match[1]);
      if (text) items.push(text);
    }
    return items;
  }

  /**
   * Return the inner HTML of a `<div>` whose content starts at `contentStart`
   * (the index just past its opening tag), by counting nested `<div>` /
   * `</div>` pairs until the matching close. If the div is never closed, the
   * rest of the input is returned.
   */
  private sliceDivContent(html: string, contentStart: number): string {
    const divTagRegex = /<div\b[^>]*>|<\/div\s*>/gi;
    divTagRegex.lastIndex = contentStart;
    let depth = 1;
    let match: RegExpExecArray | null;
    while ((match = divTagRegex.exec(html)) !== null) {
      depth += match[0].charAt(1) === '/' ? -1 : 1;
      if (depth === 0) return html.slice(contentStart, match.index);
    }
    return html.slice(contentStart);
  }

  /**
   * Extract recipe notes.
   *
   * Collects every `<div class="recipeNote">` inside `<div id="recipeNotes">`.
   *
   * @returns The cleaned notes joined by a blank line, or `null` when there is
   *   no notes container or it holds no non-empty note.
   */
  private extractNotes(html: string): string | null {
    const containerOpen = /<div\s+id\s*=\s*["']recipeNotes["'][^>]*>/i.exec(html);
    if (!containerOpen) return null;

    // The container holds nested divs, so its end must be found by balancing
    // tags; stopping at the first `</div>` would cut inside the first note.
    const notesHtml = this.sliceDivContent(html, containerOpen.index + containerOpen[0].length);

    const noteTexts: string[] = [];
    // `[\s\S]` (single backslashes in a regex literal) = any character
    // including newlines.
    const noteRegex = /<div\s+class\s*=\s*["']recipeNote["'][^>]*>([\s\S]*?)<\/div>/gi;
    let match: RegExpExecArray | null;
    while ((match = noteRegex.exec(notesHtml)) !== null) {
      const note = this.cleanText(match[1]);
      if (note) noteTexts.push(note);
    }
    return noteTexts.length > 0 ? noteTexts.join('\n\n') : null;
  }

  /**
   * Strip HTML tags, decode a small set of HTML entities, collapse runs of
   * whitespace to a single space, and trim.
   *
   * Must be applied exactly once per value: the output may legitimately
   * contain `<`, `>` or `&`, which a second pass would misread as markup.
   */
  private cleanText(text: string): string {
    return text
      .replace(/<[^>]+>/g, '') // Remove HTML tags
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'")
      .replace(/&nbsp;/g, ' ')
      // `&amp;` goes last so an escaped entity such as "&amp;nbsp;" decodes
      // to the literal text "&nbsp;" rather than being decoded twice.
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ') // Normalize whitespace
      .trim();
  }

  /**
   * Convert parsed recipe to CreateRecipeInput format.
   *
   * - `imageUrl` values starting with `images/` get a leading `/`; absolute
   *   `http://` / `https://` URLs are kept; anything else is dropped.
   * - `servings` is converted to a number when the text looks like
   *   "N serving(s)", otherwise omitted.
   * - Ingredients and steps get a zero-based `position`.
   * - `tags` are not included in the returned object.
   */
  toCreateRecipeInput(parsed: ParsedCopyMeThatRecipe): CreateRecipeInput {
    // Normalize image URL: if relative path (images/...), convert to absolute /images/...
    let imageUrl = parsed.imageUrl;
    if (imageUrl) {
      if (imageUrl.startsWith('images/')) {
        imageUrl = '/' + imageUrl;
      } else if (!imageUrl.startsWith('http://') && !imageUrl.startsWith('https://')) {
        // Other relative paths or invalid URLs — discard
        imageUrl = undefined;
      }
    }

    return {
      title: parsed.title,
      description: parsed.description,
      source_url: parsed.sourceUrl,
      image_url: imageUrl || undefined,
      made: parsed.made,
      rating: parsed.rating,
      notes: parsed.notes,
      servings: parsed.servings ? this.extractServingCount(parsed.servings) : undefined,
      ingredients: parsed.ingredients.map((item, index) => ({
        item,
        position: index,
      })),
      steps: parsed.instructions.map((instruction, index) => ({
        instruction,
        position: index,
      })),
    };
  }

  /**
   * Try to extract numeric serving count from serving string.
   *
   * @returns The number preceding "serving"/"servings" (case-insensitive),
   *   or `undefined` when the string does not have that shape.
   */
  private extractServingCount(servingStr: string): number | undefined {
    const match = /(\d+)\s*servings?/i.exec(servingStr);
    return match ? parseInt(match[1], 10) : undefined;
  }
}