From d3eeeb28330fc3bc0313642209a99fca74fac2f2 Mon Sep 17 00:00:00 2001 From: Paul Huliganga Date: Tue, 24 Mar 2026 23:43:41 -0400 Subject: [PATCH] feat(import): add heuristic fallback parser for recipe URL imports --- TODO.md | 2 +- src/backend/routes/import.ts | 7 ++ .../services/HeuristicRecipeParserService.ts | 107 ++++++++++++++++++ src/backend/tests/import.test.ts | 42 +++++++ 4 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 src/backend/services/HeuristicRecipeParserService.ts diff --git a/TODO.md b/TODO.md index 537f102..dff5195 100644 --- a/TODO.md +++ b/TODO.md @@ -37,7 +37,7 @@ MVP is functionally complete (core app + docs + tests). - [x] Add frontend error states (invalid URL, parse failure, timeout) ### Phase 3: Fallback Parsing + Hardening -- [ ] Add heuristic fallback parser when Schema.org missing +- [x] Add heuristic fallback parser when Schema.org missing - [ ] Add timeout/retry + user-friendly import failure messages - [ ] Add logging/telemetry for import success/failure reasons diff --git a/src/backend/routes/import.ts b/src/backend/routes/import.ts index 0e668ab..c4190ce 100644 --- a/src/backend/routes/import.ts +++ b/src/backend/routes/import.ts @@ -2,6 +2,7 @@ import { Router } from 'express'; import { z } from 'zod'; import { UrlImportService } from '../services/UrlImportService.js'; import { SchemaOrgRecipeParserService } from '../services/SchemaOrgRecipeParserService.js'; +import { HeuristicRecipeParserService } from '../services/HeuristicRecipeParserService.js'; const importUrlSchema = z.object({ url: z.string().url('A valid URL is required'), @@ -11,6 +12,7 @@ export function createImportRoutes(): Router { const router = Router(); const urlImportService = new UrlImportService(); const schemaOrgParser = new SchemaOrgRecipeParserService(); + const heuristicParser = new HeuristicRecipeParserService(); /** * POST /api/import/url @@ -28,6 +30,11 @@ export function createImportRoutes(): Router { if (draft) break; } + // 
import type { CreateRecipeInput } from '../types/recipe.js';

/**
 * Lightweight fallback parser for pages without usable Schema.org Recipe JSON-LD.
 *
 * The parser is regex-based on purpose (no DOM dependency). It recognises two
 * common layouts:
 *
 * 1. A heading (`<h1>`–`<h6>`) whose text is "Ingredients", "Instructions" or
 *    "Directions", immediately followed by a `<ul>`/`<ol>`.
 * 2. A `<ul>`, `<ol>` or `<div>` whose `class`/`id` contains the singular
 *    section stem ("ingredient", "instruction", "direction").
 *
 * Known limitation: container matching is lazy and not nesting-aware, so a
 * `<div>` that contains nested `<div>`s is cut at the first closing tag.
 */
export class HeuristicRecipeParserService {
  /** Named HTML entities worth decoding in recipe text (keys are lower-case). */
  private static readonly namedEntities: ReadonlyMap<string, string> = new Map([
    ['nbsp', ' '],
    ['amp', '&'],
    ['quot', '"'],
    ['apos', "'"],
    ['lt', '<'],
    ['gt', '>'],
    ['deg', '\u00b0'],
    ['frac12', '\u00bd'],
    ['frac14', '\u00bc'],
    ['frac34', '\u00be'],
  ]);

  /**
   * Builds a draft recipe from raw HTML.
   *
   * @param html - Full HTML document (or fragment) of the fetched page.
   * @param sourceUrl - URL the HTML was fetched from; copied to `source_url`.
   * @returns A draft recipe, or `null` when neither ingredients nor
   *   instructions could be found (a title alone is not considered a recipe).
   */
  parseHtml(html: string, sourceUrl?: string): CreateRecipeInput | null {
    const ingredients = this.extractSectionList(html, 'ingredients');
    const instructions = this.uniqueNonEmpty([
      ...this.extractSectionList(html, 'instructions'),
      ...this.extractSectionList(html, 'directions'),
    ]);

    if (ingredients.length === 0 && instructions.length === 0) {
      return null;
    }

    return {
      title: this.extractTitle(html) ?? 'Imported Recipe',
      ingredients,
      instructions,
      source_url: sourceUrl,
    };
  }

  /**
   * Picks the recipe title: the first non-empty `<h1>`, otherwise the
   * `<title>` element with a trailing " | Site" / " - Site" style suffix removed.
   */
  private extractTitle(html: string): string | undefined {
    const h1Text = this.normalizeText(html.match(/<h1\b[^>]*>([\s\S]*?)<\/h1>/i)?.[1] ?? '');
    // An empty <h1> (e.g. a logo image) must not win over a usable <title>.
    if (h1Text) return h1Text;

    const titleText = this.normalizeText(
      html.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i)?.[1] ?? '',
    );
    if (!titleText) return undefined;

    // Common site title separators (e.g. "Recipe Name | Site"); the separator
    // must be surrounded by whitespace so hyphenated words stay intact.
    const [head = ''] = titleText.split(/\s[-|\u2013\u2014:]\s/);
    return head.trim() || titleText;
  }

  /**
   * Collects the list items of one recipe section.
   *
   * Tries the "heading followed by a list" layout first and falls back to
   * containers tagged with a matching `class`/`id` when that yields nothing.
   */
  private extractSectionList(
    html: string,
    sectionName: 'ingredients' | 'instructions' | 'directions',
  ): string[] {
    // sectionName is a closed literal union, so interpolating it is safe.
    const headingPattern = new RegExp(
      `<h[1-6]\\b[^>]*>\\s*${sectionName}\\s*:?\\s*<\\/h[1-6]>\\s*<(ul|ol)\\b[^>]*>([\\s\\S]*?)<\\/\\1>`,
      'i',
    );
    const headingItems = this.extractListItems(html.match(headingPattern)?.[2] ?? '');
    if (headingItems.length > 0) {
      return headingItems;
    }

    // Singular stem so that both "ingredient" and "ingredients" class names match.
    const stem = sectionName.slice(0, -1);
    const containerPattern = new RegExp(
      `<(ul|ol|div)\\b[^>]*\\s(?:class|id)=["'][^"']*${stem}[^"']*["'][^>]*>([\\s\\S]*?)<\\/\\1>`,
      'gi',
    );
    const candidates = [...html.matchAll(containerPattern)].flatMap((match) =>
      this.extractListItems(match[2] ?? ''),
    );

    return this.uniqueNonEmpty(candidates);
  }

  /** Returns the normalized, de-duplicated text of every `<li>` in `sectionHtml`. */
  private extractListItems(sectionHtml: string): string[] {
    // `\b` keeps `<link ...>` from being mistaken for a list item.
    const items = [...sectionHtml.matchAll(/<li\b[^>]*>([\s\S]*?)<\/li>/gi)].map((match) =>
      this.normalizeText(match[1] ?? ''),
    );
    return this.uniqueNonEmpty(items);
  }

  /** Strips tags, decodes HTML entities once, and collapses whitespace. */
  private normalizeText(text: string): string {
    const withoutTags = text.replace(/<[^>]+>/g, ' ');
    // Single pass: text produced by decoding is never rescanned, so
    // "&amp;lt;" becomes "&lt;" rather than "<".
    const decoded = withoutTags.replace(
      /&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi,
      (entity: string, body: string) => this.decodeEntity(entity, body),
    );
    return decoded.replace(/\s+/g, ' ').trim();
  }

  /**
   * Decodes one entity reference.
   *
   * @param entity - The full reference including `&` and `;`.
   * @param body - The text between `&` and `;`.
   * @returns The decoded character, or `entity` unchanged when it is unknown
   *   or not a valid Unicode scalar value.
   */
  private decodeEntity(entity: string, body: string): string {
    if (!body.startsWith('#')) {
      return HeuristicRecipeParserService.namedEntities.get(body.toLowerCase()) ?? entity;
    }

    const isHex = body.charAt(1).toLowerCase() === 'x';
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    const isScalarValue =
      Number.isInteger(codePoint) &&
      codePoint > 0 &&
      codePoint <= 0x10ffff &&
      !(codePoint >= 0xd800 && codePoint <= 0xdfff);

    return isScalarValue ? String.fromCodePoint(codePoint) : entity;
  }

  /** Trims every value, drops blanks, and removes duplicates (first occurrence wins). */
  private uniqueNonEmpty(values: string[]): string[] {
    return [...new Set(values.map((value) => value.trim()).filter(Boolean))];
  }
}

Easy Banana Bread

+

Ingredients

+
    +
  • 3 ripe bananas
  • +
  • 2 cups flour
  • +
+

Instructions

+
    +
  1. Mash bananas.
  2. +
  3. Bake at 350°F for 50 minutes.
  4. +
+ + + `; + + vi.spyOn(globalThis, 'fetch').mockResolvedValue({ + ok: true, + status: 200, + headers: new Headers({ 'content-type': 'text/html; charset=utf-8' }), + text: async () => html, + } as Response); + + const response = await request(app) + .post('/api/import/url') + .send({ url: 'https://example.com/banana-bread' }) + .expect(200); + + expect(response.body.success).toBe(true); + expect(response.body.data.json_ld_blocks).toEqual([]); + expect(response.body.data.draft_recipe).toEqual({ + title: 'Easy Banana Bread', + ingredients: ['3 ripe bananas', '2 cups flour'], + instructions: ['Mash bananas.', 'Bake at 350°F for 50 minutes.'], + source_url: 'https://example.com/banana-bread' + }); + }); + it('should return draft_recipe as null for non-recipe JSON-LD', async () => { const html = `