// File listing metadata: 249 lines, 8.0 KiB, TypeScript.
import type { CreateRecipeInput } from '../types/recipe.js';
|
|
|
|
/**
 * A single recipe as extracted from a CopyMeThat HTML export, before it is
 * converted into the application's create-recipe input shape.
 */
export interface ParsedCopyMeThatRecipe {
  /** Recipe title (text of the element with id `name`). */
  title: string;
  /** `href` of the `original_link` anchor, when present. */
  sourceUrl?: string;
  /** Text of the element with id `description`, when present. */
  description?: string;
  /** `src` of the `recipeImage` image, exactly as written in the export. */
  imageUrl?: string;
  /** Text of every `recipeCategory` span, in document order. */
  tags: string[];
  /** Whether the export marks the recipe as made. */
  made: boolean;
  /** Rating from the `ratingValue` span; only values 1-5 are kept. */
  rating?: number;
  /** Raw yield text from the `recipeYield` anchor (e.g. "6 servings"). */
  servings?: string;
  /** Text of each `recipeIngredient` list item, in document order. */
  ingredients: string[];
  /** Text of each `instruction` list item, in document order. */
  instructions: string[];
  /** Recipe notes joined with blank lines, when present. */
  notes?: string;
}
|
|
|
|
/**
 * Parses the CopyMeThat HTML export format.
 * Supports both single-recipe HTML and multi-recipe exports.
 *
 * Parsing is regex-based: it looks for the specific ids and class names used
 * by the export rather than building a DOM.
 */
export class CopyMeThatHtmlParser {
  /**
   * Parse all recipes from a CopyMeThat HTML export file.
   *
   * @param html Full contents of the export file.
   * @returns Every recipe block that has a title, at least one ingredient and
   *   at least one instruction; other blocks are dropped.
   */
  parseRecipes(html: string): ParsedCopyMeThatRecipe[] {
    const recipeBlocks = this.extractRecipeBlocks(html);
    console.log(`[CopyMeThatHtmlParser] Found ${recipeBlocks.length} recipe blocks`);
    const parsed = recipeBlocks
      .map(block => this.parseRecipeBlock(block))
      .filter((recipe): recipe is ParsedCopyMeThatRecipe => recipe !== null);
    console.log(`[CopyMeThatHtmlParser] Successfully parsed ${parsed.length} recipes`);
    return parsed;
  }

  /**
   * Extract individual recipe HTML blocks from the document.
   *
   * A block starts at a `<div class="recipe">` opening tag and runs until the
   * next such tag, `</body>`, or the end of the input.
   */
  private extractRecipeBlocks(html: string): string[] {
    const blocks: string[] = [];
    // Match with flexible whitespace around = and quotes
    const recipeRegex = /<div\s+class\s*=\s*["']recipe["'][^>]*>([\s\S]*?)(?=<div\s+class\s*=\s*["']recipe["']|<\/body>|$)/gi;

    let match: RegExpExecArray | null;
    while ((match = recipeRegex.exec(html)) !== null) {
      blocks.push(match[0]);
    }

    return blocks;
  }

  /**
   * Parse a single recipe HTML block.
   *
   * @returns The parsed recipe, or `null` when the block lacks a title,
   *   ingredients or instructions, or when extraction throws.
   */
  private parseRecipeBlock(html: string): ParsedCopyMeThatRecipe | null {
    try {
      const title = this.extractById(html, 'name');
      const sourceUrl = this.extractLinkById(html, 'original_link');
      const description = this.extractById(html, 'description');
      const imageUrl = this.extractAttr(html, /<img\s+class\s*=\s*["']recipeImage["'][^>]*src\s*=\s*["']([^"']+)["']/i);

      const tags = this.extractTags(html);
      // Tolerate any spacing/quoting around the id attribute, like the other extractors do.
      const made = /id\s*=\s*["']made_this["']/i.test(html) || html.includes('I made this');
      const rating = this.extractRating(html);
      const servings = this.extractText(html, /<a\s+id\s*=\s*["']recipeYield["'][^>]*>([^<]+)<\/a>/i);

      const ingredients = this.extractListItems(html, 'recipeIngredient');
      const instructions = this.extractListItems(html, 'instruction');
      const notes = this.extractNotes(html);

      if (!title || ingredients.length === 0 || instructions.length === 0) {
        console.log(`[Parser] Rejected recipe - title: ${!!title}, ingredients: ${ingredients.length}, instructions: ${instructions.length}`);
        return null; // Invalid recipe
      }

      return {
        title: this.cleanText(title),
        sourceUrl: sourceUrl || undefined,
        description: description ? this.cleanText(description) : undefined,
        imageUrl: imageUrl || undefined,
        tags,
        made,
        rating,
        servings: servings ? this.cleanText(servings) : undefined,
        // Ingredients, instructions and notes were already cleaned by their
        // extractors. Cleaning twice would strip decoded "<...>" text as if it
        // were a tag and collapse the blank line between notes.
        ingredients,
        instructions,
        notes: notes ?? undefined,
      };
    } catch (error) {
      console.error('Error parsing recipe block:', error);
      return null;
    }
  }

  /**
   * Return the first capture group of `regex` in `html`, or `null` when the
   * pattern does not match.
   */
  private extractText(html: string, regex: RegExp): string | null {
    const match = regex.exec(html);
    return match?.[1] ?? null;
  }

  /**
   * Extract an attribute value: the first capture group of `regex` in `html`,
   * or `null` when the pattern does not match.
   */
  private extractAttr(html: string, regex: RegExp): string | null {
    return this.extractText(html, regex);
  }

  /**
   * Extract text from a `<div>` by id (handles spaces around =).
   * Only captures the text immediately after the opening tag, up to the next
   * `<`; nested elements are not included.
   */
  private extractById(html: string, id: string): string | null {
    const safeId = this.escapeRegExp(id);
    const regex = new RegExp(`<div\\s+id\\s*=\\s*["']${safeId}["'][^>]*>\\s*([^<]+)`, 'i');
    const match = regex.exec(html);
    return match?.[1]?.trim() ?? null;
  }

  /**
   * Extract href from an `<a>` by id (handles spaces around =).
   * The `id` attribute must come directly after the tag name, with `href`
   * somewhere later in the same tag.
   */
  private extractLinkById(html: string, id: string): string | null {
    const safeId = this.escapeRegExp(id);
    const regex = new RegExp(`<a\\s+id\\s*=\\s*["']${safeId}["'][^>]*href\\s*=\\s*["']([^"']+)["']`, 'i');
    return this.extractAttr(html, regex);
  }

  /**
   * Extract tags from `<span class="recipeCategory">` elements, cleaned and
   * with empty entries skipped.
   */
  private extractTags(html: string): string[] {
    const tags: string[] = [];
    const tagRegex = /<span\s+class\s*=\s*["']recipeCategory["'][^>]*>([^<]+)<\/span>/gi;

    let match: RegExpExecArray | null;
    while ((match = tagRegex.exec(html)) !== null) {
      const tag = this.cleanText(match[1] ?? '');
      if (tag) tags.push(tag);
    }

    return tags;
  }

  /**
   * Extract rating value (1-5) from `<span id="ratingValue">`.
   *
   * @returns The rating, or `undefined` when absent or outside 1-5.
   */
  private extractRating(html: string): number | undefined {
    const ratingMatch = /<span\s+id\s*=\s*["']ratingValue["'][^>]*>(\d+)<\/span>/i.exec(html);
    if (!ratingMatch) return undefined;

    const rating = parseInt(ratingMatch[1] ?? '', 10);
    return rating >= 1 && rating <= 5 ? rating : undefined;
  }

  /**
   * Extract list items (ingredients or instructions): the cleaned content of
   * every `<li class="className">`, with empty items skipped.
   */
  private extractListItems(html: string, className: string): string[] {
    const items: string[] = [];
    const safeClass = this.escapeRegExp(className);
    // Flexible regex to handle spaces around =
    const itemRegex = new RegExp(`<li\\s+class\\s*=\\s*["']${safeClass}["'][^>]*>([\\s\\S]*?)<\\/li>`, 'gi');

    let match: RegExpExecArray | null;
    while ((match = itemRegex.exec(html)) !== null) {
      const text = this.cleanText(match[1] ?? '');
      if (text) items.push(text);
    }

    return items;
  }

  /**
   * Extract recipe notes: the cleaned text of each `<div class="recipeNote">`
   * found after the `<div id="recipeNotes">` opening tag, joined with blank
   * lines.
   *
   * @returns The joined notes, or `null` when there is no notes container or
   *   it holds no non-empty note.
   */
  private extractNotes(html: string): string | null {
    const container = /<div\s+id\s*=\s*["']recipeNotes["'][^>]*>/i.exec(html);
    if (!container) return null;

    // Scan from just after the container's opening tag. A lazy match up to the
    // first </div> would end at the first note's closing tag and cut it off.
    const notesHtml = html.slice(container.index + container[0].length);
    const noteTexts: string[] = [];
    const noteRegex = /<div\s+class\s*=\s*["']recipeNote["'][^>]*>([\s\S]*?)<\/div>/gi;

    let match: RegExpExecArray | null;
    while ((match = noteRegex.exec(notesHtml)) !== null) {
      const note = this.cleanText(match[1] ?? '');
      if (note) noteTexts.push(note);
    }

    return noteTexts.length > 0 ? noteTexts.join('\n\n') : null;
  }

  /**
   * Strip HTML tags, decode common HTML entities and normalize whitespace.
   */
  private cleanText(text: string): string {
    return text
      .replace(/<[^>]+>/g, '') // Remove HTML tags
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#39;|&#x27;|&apos;/gi, "'")
      .replace(/&nbsp;/g, ' ')
      // Decode &amp; last so "&amp;lt;" becomes the literal text "&lt;", not "<".
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ') // Normalize whitespace
      .trim();
  }

  /**
   * Escape regex metacharacters so `value` is matched literally when it is
   * interpolated into a `RegExp` source.
   */
  private escapeRegExp(value: string): string {
    return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Convert parsed recipe to CreateRecipeInput format.
   *
   * Ingredients and instructions get zero-based positions. Tags are not
   * included in the returned object.
   */
  toCreateRecipeInput(parsed: ParsedCopyMeThatRecipe): CreateRecipeInput {
    // Normalize image URL: if relative path (images/...), convert to absolute /images/...
    let imageUrl = parsed.imageUrl;
    if (imageUrl) {
      if (imageUrl.startsWith('images/')) {
        imageUrl = '/' + imageUrl;
      } else if (!imageUrl.startsWith('http://') && !imageUrl.startsWith('https://')) {
        // Other relative paths or invalid URLs — discard
        imageUrl = undefined;
      }
    }

    return {
      title: parsed.title,
      description: parsed.description,
      source_url: parsed.sourceUrl,
      image_url: imageUrl || undefined,
      made: parsed.made,
      rating: parsed.rating,
      notes: parsed.notes,
      servings: parsed.servings ? this.extractServingCount(parsed.servings) : undefined,
      ingredients: parsed.ingredients.map((item, index) => ({
        item,
        position: index,
      })),
      steps: parsed.instructions.map((instruction, index) => ({
        instruction,
        position: index,
      })),
    };
  }

  /**
   * Try to extract numeric serving count from serving string.
   *
   * @returns The number preceding "serving"/"servings", or `undefined` when
   *   the string has no such pattern.
   */
  private extractServingCount(servingStr: string): number | undefined {
    const match = /(\d+)\s*servings?/i.exec(servingStr);
    return match ? parseInt(match[1] ?? '', 10) : undefined;
  }
}
|