feat(import): add heuristic fallback parser for recipe URL imports
This commit is contained in:
parent
3d07ff6a49
commit
d3eeeb2833
2
TODO.md
2
TODO.md
|
|
@ -37,7 +37,7 @@ MVP is functionally complete (core app + docs + tests).
|
||||||
- [x] Add frontend error states (invalid URL, parse failure, timeout)
|
- [x] Add frontend error states (invalid URL, parse failure, timeout)
|
||||||
|
|
||||||
### Phase 3: Fallback Parsing + Hardening
|
### Phase 3: Fallback Parsing + Hardening
|
||||||
- [ ] Add heuristic fallback parser when Schema.org missing
|
- [x] Add heuristic fallback parser when Schema.org missing
|
||||||
- [ ] Add timeout/retry + user-friendly import failure messages
|
- [ ] Add timeout/retry + user-friendly import failure messages
|
||||||
- [ ] Add logging/telemetry for import success/failure reasons
|
- [ ] Add logging/telemetry for import success/failure reasons
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ import { Router } from 'express';
|
||||||
import { z } from 'zod';
|
import { z } from 'zod';
|
||||||
import { UrlImportService } from '../services/UrlImportService.js';
|
import { UrlImportService } from '../services/UrlImportService.js';
|
||||||
import { SchemaOrgRecipeParserService } from '../services/SchemaOrgRecipeParserService.js';
|
import { SchemaOrgRecipeParserService } from '../services/SchemaOrgRecipeParserService.js';
|
||||||
|
import { HeuristicRecipeParserService } from '../services/HeuristicRecipeParserService.js';
|
||||||
|
|
||||||
const importUrlSchema = z.object({
|
const importUrlSchema = z.object({
|
||||||
url: z.string().url('A valid URL is required'),
|
url: z.string().url('A valid URL is required'),
|
||||||
|
|
@ -11,6 +12,7 @@ export function createImportRoutes(): Router {
|
||||||
const router = Router();
|
const router = Router();
|
||||||
const urlImportService = new UrlImportService();
|
const urlImportService = new UrlImportService();
|
||||||
const schemaOrgParser = new SchemaOrgRecipeParserService();
|
const schemaOrgParser = new SchemaOrgRecipeParserService();
|
||||||
|
const heuristicParser = new HeuristicRecipeParserService();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* POST /api/import/url
|
* POST /api/import/url
|
||||||
|
|
@ -28,6 +30,11 @@ export function createImportRoutes(): Router {
|
||||||
if (draft) break;
|
if (draft) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Fallback: heuristic HTML parser when Schema.org data is missing/invalid
|
||||||
|
if (!draft) {
|
||||||
|
draft = heuristicParser.parseHtml(result.html, result.source_url);
|
||||||
|
}
|
||||||
|
|
||||||
res.status(200).json({
|
res.status(200).json({
|
||||||
success: true,
|
success: true,
|
||||||
data: { ...result, draft_recipe: draft },
|
data: { ...result, draft_recipe: draft },
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,107 @@
|
||||||
|
import type { CreateRecipeInput } from '../types/recipe.js';
|
||||||
|
|
||||||
|
/**
 * Lightweight fallback parser for pages without usable Schema.org Recipe JSON-LD.
 *
 * Works on the raw HTML string with regular expressions only (no DOM), looking
 * for a title plus `<li>` items that belong to an "ingredients",
 * "instructions" or "directions" section. It is a best-effort heuristic:
 * nested containers of the same tag and unusual markup are not handled.
 */
export class HeuristicRecipeParserService {
  /**
   * Builds a draft recipe from raw HTML.
   *
   * @param html - Full HTML document (or fragment) to inspect.
   * @param sourceUrl - URL the HTML was fetched from; copied to `source_url`.
   * @returns A draft recipe, or `null` when neither ingredients nor
   *   instructions could be found (a title alone is not enough).
   */
  parseHtml(html: string, sourceUrl?: string): CreateRecipeInput | null {
    const title = this.extractTitle(html);
    const ingredients = this.extractSectionList(html, 'ingredients');
    // Sites label the method either "Instructions" or "Directions"; merge both.
    const instructions = this.uniqueNonEmpty([
      ...this.extractSectionList(html, 'instructions'),
      ...this.extractSectionList(html, 'directions'),
    ]);

    if (ingredients.length === 0 && instructions.length === 0) {
      return null;
    }

    return {
      title: title ?? 'Imported Recipe',
      ingredients,
      instructions,
      source_url: sourceUrl,
    };
  }

  /**
   * Picks a recipe title: the first `<h1>` with visible text, otherwise the
   * document `<title>` with a trailing site name removed.
   *
   * @returns The title, or `undefined` when no non-empty candidate exists.
   */
  private extractTitle(html: string): string | undefined {
    const h1Text = this.normalizeText(/<h1[^>]*>([\s\S]*?)<\/h1>/i.exec(html)?.[1] ?? '');
    // An <h1> holding only an image/logo normalizes to '' — fall through to
    // <title> instead of returning an empty title.
    if (h1Text) return h1Text;

    const titleText = this.normalizeText(/<title[^>]*>([\s\S]*?)<\/title>/i.exec(html)?.[1] ?? '');
    if (!titleText) return undefined;

    // Common site title separators (e.g., "Recipe Name | Site"): keep the part
    // before the first " - ", " | ", " – " or " : ".
    const [head] = titleText.split(/\s[-|–:]\s/);
    // `||` is deliberate: an empty first segment falls back to the full title.
    return head?.trim() || titleText;
  }

  /**
   * Collects the list items of one named section.
   *
   * Strategy 1: a heading whose text is exactly the section name, immediately
   * followed by a `<ul>`/`<ol>`. Strategy 2 (used when strategy 1 yields no
   * items): every `<ul>`, `<ol>` or `<div>` whose `class`/`id` contains the
   * singular section name (e.g. "ingredient").
   *
   * @returns De-duplicated, non-empty item texts in document order.
   */
  private extractSectionList(html: string, sectionName: 'ingredients' | 'instructions' | 'directions'): string[] {
    const headingPattern = new RegExp(
      `<h[1-6][^>]*>\\s*${sectionName}\\s*<\\/h[1-6]>\\s*<(ul|ol)[^>]*>([\\s\\S]*?)<\\/\\1>`,
      'i',
    );
    const headingItems = this.extractListItems(headingPattern.exec(html)?.[2] ?? '');
    if (headingItems.length > 0) {
      return headingItems;
    }

    // Drop the trailing "s" so class names like "recipe-ingredient" also match.
    const keyword = sectionName.slice(0, -1);
    const classPattern = new RegExp(
      `<(ul|ol|div)[^>]*(class|id)=["'][^"']*${keyword}[^"']*["'][^>]*>([\\s\\S]*?)<\\/\\1>`,
      'gi',
    );

    const candidates: string[] = [];
    for (const match of html.matchAll(classPattern)) {
      candidates.push(...this.extractListItems(match[3] ?? ''));
    }
    return this.uniqueNonEmpty(candidates);
  }

  /**
   * Extracts the normalized text of every `<li>` in an HTML fragment.
   *
   * @returns De-duplicated, non-empty item texts in document order.
   */
  private extractListItems(sectionHtml: string): string[] {
    const items: string[] = [];
    for (const match of sectionHtml.matchAll(/<li[^>]*>([\s\S]*?)<\/li>/gi)) {
      items.push(this.normalizeText(match[1] ?? ''));
    }
    return this.uniqueNonEmpty(items);
  }

  /**
   * Converts an HTML fragment to plain text: strips tags, decodes character
   * references and collapses whitespace.
   *
   * Decoding happens in a single pass so already-escaped text is not decoded
   * twice (`&amp;lt;` becomes `&lt;`, not `<`). Unknown named references and
   * out-of-range numeric references are left untouched.
   */
  private normalizeText(text: string): string {
    const namedEntities: Record<string, string> = {
      nbsp: ' ',
      amp: '&',
      quot: '"',
      apos: "'",
      lt: '<',
      gt: '>',
    };

    const withoutTags = text.replace(/<[^>]+>/g, ' ');
    const decoded = withoutTags.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
      if (!body.startsWith('#')) {
        return namedEntities[body.toLowerCase()] ?? entity;
      }
      const isHex = body[1] === 'x' || body[1] === 'X';
      const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      const isValid =
        codePoint > 0 && codePoint <= 0x10ffff && !(codePoint >= 0xd800 && codePoint <= 0xdfff);
      return isValid ? String.fromCodePoint(codePoint) : entity;
    });

    return decoded.replace(/\s+/g, ' ').trim();
  }

  /** Trims every value, drops empty ones and removes duplicates, keeping first-seen order. */
  private uniqueNonEmpty(values: string[]): string[] {
    return [...new Set(values.map((v) => v.trim()).filter(Boolean))];
  }
}
|
||||||
|
|
@ -91,6 +91,48 @@ describe('Import API', () => {
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('should use heuristic fallback parser when Schema.org data is missing', async () => {
  // Page with visible recipe markup (headings + lists) but no JSON-LD blocks,
  // so only the heuristic parser can produce a draft.
  const html = `
    <html>
      <head><title>Easy Banana Bread | Example</title></head>
      <body>
        <h1>Easy Banana Bread</h1>
        <h2>Ingredients</h2>
        <ul>
          <li>3 ripe bananas</li>
          <li>2 cups flour</li>
        </ul>
        <h2>Instructions</h2>
        <ol>
          <li>Mash bananas.</li>
          <li>Bake at 350°F for 50 minutes.</li>
        </ol>
      </body>
    </html>
  `;

  // Stub the network fetch so the import route receives the HTML above.
  const mockedResponse = {
    ok: true,
    status: 200,
    headers: new Headers({ 'content-type': 'text/html; charset=utf-8' }),
    text: async () => html,
  } as Response;
  vi.spyOn(globalThis, 'fetch').mockResolvedValue(mockedResponse);

  const { body } = await request(app)
    .post('/api/import/url')
    .send({ url: 'https://example.com/banana-bread' })
    .expect(200);

  expect(body.success).toBe(true);
  // No JSON-LD was present, so the draft must come from the fallback parser.
  expect(body.data.json_ld_blocks).toEqual([]);
  expect(body.data.draft_recipe).toEqual({
    title: 'Easy Banana Bread',
    ingredients: ['3 ripe bananas', '2 cups flour'],
    instructions: ['Mash bananas.', 'Bake at 350°F for 50 minutes.'],
    source_url: 'https://example.com/banana-bread',
  });
});
|
||||||
|
|
||||||
it('should return draft_recipe as null for non-recipe JSON-LD', async () => {
|
it('should return draft_recipe as null for non-recipe JSON-LD', async () => {
|
||||||
const html = `
|
const html = `
|
||||||
<html>
|
<html>
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue