feat(import): add heuristic fallback parser for recipe URL imports

This commit is contained in:
Paul Huliganga 2026-03-24 23:43:41 -04:00
parent 3d07ff6a49
commit d3eeeb2833
4 changed files with 157 additions and 1 deletions

View File

@ -37,7 +37,7 @@ MVP is functionally complete (core app + docs + tests).
- [x] Add frontend error states (invalid URL, parse failure, timeout) - [x] Add frontend error states (invalid URL, parse failure, timeout)
### Phase 3: Fallback Parsing + Hardening ### Phase 3: Fallback Parsing + Hardening
- [ ] Add heuristic fallback parser when Schema.org missing - [x] Add heuristic fallback parser when Schema.org missing
- [ ] Add timeout/retry + user-friendly import failure messages - [ ] Add timeout/retry + user-friendly import failure messages
- [ ] Add logging/telemetry for import success/failure reasons - [ ] Add logging/telemetry for import success/failure reasons

View File

@ -2,6 +2,7 @@ import { Router } from 'express';
import { z } from 'zod'; import { z } from 'zod';
import { UrlImportService } from '../services/UrlImportService.js'; import { UrlImportService } from '../services/UrlImportService.js';
import { SchemaOrgRecipeParserService } from '../services/SchemaOrgRecipeParserService.js'; import { SchemaOrgRecipeParserService } from '../services/SchemaOrgRecipeParserService.js';
import { HeuristicRecipeParserService } from '../services/HeuristicRecipeParserService.js';
const importUrlSchema = z.object({ const importUrlSchema = z.object({
url: z.string().url('A valid URL is required'), url: z.string().url('A valid URL is required'),
@ -11,6 +12,7 @@ export function createImportRoutes(): Router {
const router = Router(); const router = Router();
const urlImportService = new UrlImportService(); const urlImportService = new UrlImportService();
const schemaOrgParser = new SchemaOrgRecipeParserService(); const schemaOrgParser = new SchemaOrgRecipeParserService();
const heuristicParser = new HeuristicRecipeParserService();
/** /**
* POST /api/import/url * POST /api/import/url
@ -28,6 +30,11 @@ export function createImportRoutes(): Router {
if (draft) break; if (draft) break;
} }
// Fallback: heuristic HTML parser when Schema.org data is missing/invalid
if (!draft) {
draft = heuristicParser.parseHtml(result.html, result.source_url);
}
res.status(200).json({ res.status(200).json({
success: true, success: true,
data: { ...result, draft_recipe: draft }, data: { ...result, draft_recipe: draft },

View File

@ -0,0 +1,107 @@
import type { CreateRecipeInput } from '../types/recipe.js';
/**
 * Lightweight fallback parser for pages without usable Schema.org Recipe JSON-LD.
 *
 * The parser works on the raw HTML string with regular expressions only (no
 * DOM), so it is deliberately best-effort: it looks for an "Ingredients" /
 * "Instructions" / "Directions" heading that is immediately followed by a
 * list, and otherwise for `<ul>`, `<ol>` or `<div>` elements whose `class` or
 * `id` mentions the section name.
 */
export class HeuristicRecipeParserService {
  /** Named character references understood by {@link decodeEntities} (keys are lower-case). */
  private static readonly NAMED_ENTITIES: Readonly<Record<string, string>> = {
    nbsp: ' ',
    amp: '&',
    quot: '"',
    apos: "'",
    lt: '<',
    gt: '>',
  };

  /**
   * Builds a draft recipe from raw HTML.
   *
   * @param html - Full HTML document (or fragment) to inspect.
   * @param sourceUrl - URL the HTML came from; copied verbatim to `source_url`.
   * @returns A draft recipe, or `null` when neither ingredients nor
   *   instructions could be found (a title alone is not treated as a recipe).
   */
  parseHtml(html: string, sourceUrl?: string): CreateRecipeInput | null {
    const ingredients = this.extractSectionList(html, 'ingredients');
    // Sites label the method either "Instructions" or "Directions"; merge both.
    const instructions = this.uniqueNonEmpty([
      ...this.extractSectionList(html, 'instructions'),
      ...this.extractSectionList(html, 'directions'),
    ]);

    if (ingredients.length === 0 && instructions.length === 0) {
      return null;
    }

    return {
      title: this.extractTitle(html) ?? 'Imported Recipe',
      ingredients,
      instructions,
      source_url: sourceUrl,
    };
  }

  /**
   * Picks a recipe title: the first `<h1>` with visible text, otherwise the
   * leading part of the document `<title>`.
   *
   * @returns The title, or `undefined` when no non-empty candidate exists.
   */
  private extractTitle(html: string): string | undefined {
    // An <h1> that holds only markup (e.g. a logo image) normalizes to '',
    // which must not win over later headings or the <title> fallback.
    const h1Regex = /<h1[^>]*>([\s\S]*?)<\/h1>/gi;
    for (let match = h1Regex.exec(html); match !== null; match = h1Regex.exec(html)) {
      const heading = this.normalizeText(match[1] ?? '');
      if (heading) return heading;
    }

    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    if (!titleMatch?.[1]) return undefined;

    const raw = this.normalizeText(titleMatch[1]);
    if (!raw) return undefined;

    // Common site title separators (e.g., "Recipe Name | Site"): hyphen, pipe,
    // colon, en dash and em dash, each surrounded by whitespace.
    const split = raw.split(/\s[-|:\u2013\u2014]\s/);
    return split[0]?.trim() || raw;
  }

  /**
   * Collects the list items that belong to one named recipe section.
   *
   * Strategy 1: a heading whose text is exactly the section name, directly
   * followed by a `<ul>`/`<ol>`. Strategy 2: every `<ul>`/`<ol>`/`<div>` whose
   * `class` or `id` contains the singular section name.
   *
   * NOTE(review): the non-greedy match stops at the first matching closing
   * tag, so nested elements of the same kind truncate the captured section.
   */
  private extractSectionList(html: string, sectionName: 'ingredients' | 'instructions' | 'directions'): string[] {
    const headingPattern = new RegExp(
      `<h[1-6][^>]*>\\s*${sectionName}\\s*<\\/h[1-6]>\\s*<(ul|ol)[^>]*>([\\s\\S]*?)<\\/\\1>`,
      'i',
    );

    const headingMatch = html.match(headingPattern);
    if (headingMatch?.[2]) {
      return this.extractListItems(headingMatch[2]);
    }

    // Drop the trailing "s" so "ingredient-list", "instruction_step", ... match too.
    const classPattern = new RegExp(
      `<(ul|ol|div)[^>]*(class|id)=["'][^"']*${sectionName.slice(0, -1)}[^"']*["'][^>]*>([\\s\\S]*?)<\\/\\1>`,
      'gi',
    );

    const candidates: string[] = [];
    for (let match = classPattern.exec(html); match !== null; match = classPattern.exec(html)) {
      candidates.push(...this.extractListItems(match[3] ?? ''));
    }

    return this.uniqueNonEmpty(candidates);
  }

  /** Returns the normalized, de-duplicated text of every `<li>` in `sectionHtml`. */
  private extractListItems(sectionHtml: string): string[] {
    const listItemRegex = /<li[^>]*>([\s\S]*?)<\/li>/gi;
    const items: string[] = [];

    for (let match = listItemRegex.exec(sectionHtml); match !== null; match = listItemRegex.exec(sectionHtml)) {
      const normalized = this.normalizeText(match[1] ?? '');
      if (normalized) {
        items.push(normalized);
      }
    }

    return this.uniqueNonEmpty(items);
  }

  /**
   * Converts an HTML fragment to plain text: strips tags, decodes character
   * references, collapses whitespace runs to single spaces and trims.
   */
  private normalizeText(text: string): string {
    // Tags are removed before decoding so that escaped markup such as
    // "&lt;b&gt;" survives as literal text instead of being stripped.
    const withoutTags = text.replace(/<[^>]+>/g, ' ');
    return this.decodeEntities(withoutTags).replace(/\s+/g, ' ').trim();
  }

  /**
   * Decodes the named references in {@link NAMED_ENTITIES} plus decimal and
   * hexadecimal numeric references in a single pass, so already-decoded output
   * is never decoded again ("&amp;lt;" becomes "&lt;", not "<"). Unknown or
   * invalid references are left untouched.
   */
  private decodeEntities(text: string): string {
    return text.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (entity: string, body: string): string => {
      if (body.startsWith('#')) {
        const isHex = body.charAt(1).toLowerCase() === 'x';
        const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
        const isValid = codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
        return isValid ? String.fromCodePoint(codePoint) : entity;
      }
      return HeuristicRecipeParserService.NAMED_ENTITIES[body.toLowerCase()] ?? entity;
    });
  }

  /** Trims every value, drops empty ones and removes duplicates, keeping first-seen order. */
  private uniqueNonEmpty(values: string[]): string[] {
    return [...new Set(values.map((v) => v.trim()).filter(Boolean))];
  }
}

View File

@ -91,6 +91,48 @@ describe('Import API', () => {
}); });
}); });
// Verifies the import endpoint's fallback path: the fetched page carries no
// JSON-LD at all, so the draft recipe must come from the heuristic HTML parser.
it('should use heuristic fallback parser when Schema.org data is missing', async () => {
// Fixture: plain HTML with an <h1> title and heading + list sections, and no
// <script type="application/ld+json"> block.
const html = `
<html>
<head><title>Easy Banana Bread | Example</title></head>
<body>
<h1>Easy Banana Bread</h1>
<h2>Ingredients</h2>
<ul>
<li>3 ripe bananas</li>
<li>2 cups flour</li>
</ul>
<h2>Instructions</h2>
<ol>
<li>Mash bananas.</li>
<li>Bake at 350°F for 50 minutes.</li>
</ol>
</body>
</html>
`;
// Stub the global fetch so the request resolves to the fixture instead of
// hitting the network.
vi.spyOn(globalThis, 'fetch').mockResolvedValue({
ok: true,
status: 200,
headers: new Headers({ 'content-type': 'text/html; charset=utf-8' }),
text: async () => html,
} as Response);
// Exercise the real route and require a 200 response.
const response = await request(app)
.post('/api/import/url')
.send({ url: 'https://example.com/banana-bread' })
.expect(200);
expect(response.body.success).toBe(true);
// No JSON-LD blocks were reported for this page ...
expect(response.body.data.json_ld_blocks).toEqual([]);
// ... yet a draft is still returned: title from the <h1>, items from the
// lists that follow the "Ingredients" / "Instructions" headings.
expect(response.body.data.draft_recipe).toEqual({
title: 'Easy Banana Bread',
ingredients: ['3 ripe bananas', '2 cups flour'],
instructions: ['Mash bananas.', 'Bake at 350°F for 50 minutes.'],
source_url: 'https://example.com/banana-bread'
});
});
it('should return draft_recipe as null for non-recipe JSON-LD', async () => { it('should return draft_recipe as null for non-recipe JSON-LD', async () => {
const html = ` const html = `
<html> <html>