feat(import): add heuristic fallback parser for recipe URL imports
This commit is contained in:
parent
3d07ff6a49
commit
d3eeeb2833
2
TODO.md
2
TODO.md
|
|
@ -37,7 +37,7 @@ MVP is functionally complete (core app + docs + tests).
|
|||
- [x] Add frontend error states (invalid URL, parse failure, timeout)
|
||||
|
||||
### Phase 3: Fallback Parsing + Hardening
|
||||
- [ ] Add heuristic fallback parser when Schema.org missing
|
||||
- [x] Add heuristic fallback parser when Schema.org missing
|
||||
- [ ] Add timeout/retry + user-friendly import failure messages
|
||||
- [ ] Add logging/telemetry for import success/failure reasons
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import { Router } from 'express';
|
|||
import { z } from 'zod';
|
||||
import { UrlImportService } from '../services/UrlImportService.js';
|
||||
import { SchemaOrgRecipeParserService } from '../services/SchemaOrgRecipeParserService.js';
|
||||
import { HeuristicRecipeParserService } from '../services/HeuristicRecipeParserService.js';
|
||||
|
||||
const importUrlSchema = z.object({
|
||||
url: z.string().url('A valid URL is required'),
|
||||
|
|
@ -11,6 +12,7 @@ export function createImportRoutes(): Router {
|
|||
const router = Router();
|
||||
const urlImportService = new UrlImportService();
|
||||
const schemaOrgParser = new SchemaOrgRecipeParserService();
|
||||
const heuristicParser = new HeuristicRecipeParserService();
|
||||
|
||||
/**
|
||||
* POST /api/import/url
|
||||
|
|
@ -28,6 +30,11 @@ export function createImportRoutes(): Router {
|
|||
if (draft) break;
|
||||
}
|
||||
|
||||
// Fallback: heuristic HTML parser when Schema.org data is missing/invalid
|
||||
if (!draft) {
|
||||
draft = heuristicParser.parseHtml(result.html, result.source_url);
|
||||
}
|
||||
|
||||
res.status(200).json({
|
||||
success: true,
|
||||
data: { ...result, draft_recipe: draft },
|
||||
|
|
|
|||
|
|
@ -0,0 +1,107 @@
|
|||
import type { CreateRecipeInput } from '../types/recipe.js';
|
||||
|
||||
/**
|
||||
* Lightweight fallback parser for pages without usable Schema.org Recipe JSON-LD.
|
||||
*/
|
||||
export class HeuristicRecipeParserService {
  /**
   * Named character references decoded by {@link normalizeText}, keyed by the
   * lower-cased name without the leading ampersand and trailing semicolon.
   * A Map is used (rather than an object literal) so names such as
   * "constructor" cannot resolve through the prototype chain.
   */
  private static readonly NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
    ['nbsp', ' '],
    ['amp', '&'],
    ['quot', '"'],
    ['apos', "'"],
    ['lt', '<'],
    ['gt', '>'],
  ]);

  /**
   * Builds a draft recipe from raw HTML using regex heuristics only.
   *
   * Instructions are collected from both an "instructions" and a "directions"
   * section and de-duplicated, preserving first-seen order.
   *
   * @param html - Raw page markup.
   * @param sourceUrl - Copied verbatim into `source_url` of the draft.
   * @returns The draft, or `null` when neither ingredients nor instructions
   *   could be found (a title alone is not considered a recipe).
   */
  parseHtml(html: string, sourceUrl?: string): CreateRecipeInput | null {
    const ingredients = this.extractSectionList(html, 'ingredients');
    const instructions = this.uniqueNonEmpty([
      ...this.extractSectionList(html, 'instructions'),
      ...this.extractSectionList(html, 'directions'),
    ]);

    // A page without any list content is not a usable recipe, whatever its title.
    if (ingredients.length === 0 && instructions.length === 0) {
      return null;
    }

    return {
      title: this.extractTitle(html) ?? 'Imported Recipe',
      ingredients,
      instructions,
      source_url: sourceUrl,
    };
  }

  /**
   * Picks a title from the first `<h1>`, falling back to `<title>`.
   *
   * An `<h1>` whose text is empty after tag stripping (for example one that
   * only wraps an image) is ignored so the `<title>` fallback still applies.
   *
   * @returns The title text, or `undefined` when no non-empty candidate exists.
   */
  private extractTitle(html: string): string | undefined {
    const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
    const heading = this.normalizeText(h1Match?.[1] ?? '');
    if (heading) return heading;

    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    const raw = this.normalizeText(titleMatch?.[1] ?? '');
    if (!raw) return undefined;

    // Common site title separators (e.g., "Recipe Name | Site"): hyphen, pipe,
    // colon, en dash and em dash, each surrounded by whitespace.
    const [firstPart] = raw.split(/\s[-|:\u2013\u2014]\s/);
    return firstPart?.trim() || raw;
  }

  /**
   * Extracts the list items belonging to a named recipe section.
   *
   * Two strategies are tried in order:
   * 1. A heading (`h1`-`h6`) whose text is exactly the section name, directly
   *    followed by a `<ul>` or `<ol>`.
   * 2. Any `<ul>`, `<ol>` or `<div>` whose `class` or `id` contains the
   *    singular form of the section name (e.g. "ingredient").
   *
   * Matching is regex based and non-greedy, so a container is cut at the first
   * closing tag of the same name; nested same-name containers are truncated.
   *
   * @returns De-duplicated, normalized item texts; empty when nothing matched.
   */
  private extractSectionList(html: string, sectionName: 'ingredients' | 'instructions' | 'directions'): string[] {
    const headingPattern = new RegExp(
      `<h[1-6][^>]*>\\s*${sectionName}\\s*<\\/h[1-6]>\\s*<(ul|ol)[^>]*>([\\s\\S]*?)<\\/\\1>`,
      'i',
    );

    const headingMatch = html.match(headingPattern);
    if (headingMatch?.[2]) {
      return this.extractListItems(headingMatch[2]);
    }

    // Drop the trailing "s" so both singular and plural class/id names match.
    const stem = sectionName.slice(0, -1);
    const classPattern = new RegExp(
      `<(ul|ol|div)[^>]*(class|id)=["'][^"']*${stem}[^"']*["'][^>]*>([\\s\\S]*?)<\\/\\1>`,
      'gi',
    );

    const candidates = [...html.matchAll(classPattern)].flatMap((match) =>
      this.extractListItems(match[3] ?? ''),
    );

    return this.uniqueNonEmpty(candidates);
  }

  /**
   * Returns the normalized text of every `<li>` in the given markup,
   * without empty entries or duplicates.
   */
  private extractListItems(sectionHtml: string): string[] {
    const items = [...sectionHtml.matchAll(/<li[^>]*>([\s\S]*?)<\/li>/gi)].map((match) =>
      this.normalizeText(match[1] ?? ''),
    );

    return this.uniqueNonEmpty(items);
  }

  /**
   * Converts an HTML fragment to plain text: strips tags, decodes character
   * references, collapses whitespace runs and trims.
   *
   * Tags are removed before decoding so escaped angle brackets survive as
   * literal text. Decoding is done in a single pass, so an escaped ampersand
   * followed by "lt;" yields the literal entity text rather than "<".
   */
  private normalizeText(text: string): string {
    const withoutTags = text.replace(/<[^>]+>/g, ' ');
    const decoded = withoutTags.replace(
      /&(#x[0-9a-f]+|#\d+|[a-z]+);/gi,
      (entity: string, body: string) => this.decodeEntity(entity, body),
    );

    return decoded.replace(/\s+/g, ' ').trim();
  }

  /**
   * Decodes one character reference.
   *
   * @param entity - The full matched reference, returned unchanged when it is
   *   unknown or names an invalid code point.
   * @param body - The reference without its ampersand and semicolon.
   */
  private decodeEntity(entity: string, body: string): string {
    if (!body.startsWith('#')) {
      return HeuristicRecipeParserService.NAMED_ENTITIES.get(body.toLowerCase()) ?? entity;
    }

    const isHex = body.charAt(1).toLowerCase() === 'x';
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);

    // String.fromCodePoint throws a RangeError outside the Unicode range, and
    // NUL / lone surrogates are never meaningful page text.
    const isValid =
      Number.isInteger(codePoint) &&
      codePoint > 0 &&
      codePoint <= 0x10ffff &&
      !(codePoint >= 0xd800 && codePoint <= 0xdfff);

    return isValid ? String.fromCodePoint(codePoint) : entity;
  }

  /** Trims every value, drops empty ones and removes duplicates, keeping first-seen order. */
  private uniqueNonEmpty(values: readonly string[]): string[] {
    return [...new Set(values.map((value) => value.trim()).filter(Boolean))];
  }
}
|
||||
|
|
@ -91,6 +91,48 @@ describe('Import API', () => {
|
|||
});
|
||||
});
|
||||
|
||||
it('should use heuristic fallback parser when Schema.org data is missing', async () => {
|
||||
const html = `
|
||||
<html>
|
||||
<head><title>Easy Banana Bread | Example</title></head>
|
||||
<body>
|
||||
<h1>Easy Banana Bread</h1>
|
||||
<h2>Ingredients</h2>
|
||||
<ul>
|
||||
<li>3 ripe bananas</li>
|
||||
<li>2 cups flour</li>
|
||||
</ul>
|
||||
<h2>Instructions</h2>
|
||||
<ol>
|
||||
<li>Mash bananas.</li>
|
||||
<li>Bake at 350°F for 50 minutes.</li>
|
||||
</ol>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
vi.spyOn(globalThis, 'fetch').mockResolvedValue({
|
||||
ok: true,
|
||||
status: 200,
|
||||
headers: new Headers({ 'content-type': 'text/html; charset=utf-8' }),
|
||||
text: async () => html,
|
||||
} as Response);
|
||||
|
||||
const response = await request(app)
|
||||
.post('/api/import/url')
|
||||
.send({ url: 'https://example.com/banana-bread' })
|
||||
.expect(200);
|
||||
|
||||
expect(response.body.success).toBe(true);
|
||||
expect(response.body.data.json_ld_blocks).toEqual([]);
|
||||
expect(response.body.data.draft_recipe).toEqual({
|
||||
title: 'Easy Banana Bread',
|
||||
ingredients: ['3 ripe bananas', '2 cups flour'],
|
||||
instructions: ['Mash bananas.', 'Bake at 350°F for 50 minutes.'],
|
||||
source_url: 'https://example.com/banana-bread'
|
||||
});
|
||||
});
|
||||
|
||||
it('should return draft_recipe as null for non-recipe JSON-LD', async () => {
|
||||
const html = `
|
||||
<html>
|
||||
|
|
|
|||
Loading…
Reference in New Issue