diff --git a/TODO.md b/TODO.md
index 537f102..dff5195 100644
--- a/TODO.md
+++ b/TODO.md
@@ -37,7 +37,7 @@ MVP is functionally complete (core app + docs + tests).
- [x] Add frontend error states (invalid URL, parse failure, timeout)
### Phase 3: Fallback Parsing + Hardening
-- [ ] Add heuristic fallback parser when Schema.org missing
+- [x] Add heuristic fallback parser when Schema.org missing
- [ ] Add timeout/retry + user-friendly import failure messages
- [ ] Add logging/telemetry for import success/failure reasons
diff --git a/src/backend/routes/import.ts b/src/backend/routes/import.ts
index 0e668ab..c4190ce 100644
--- a/src/backend/routes/import.ts
+++ b/src/backend/routes/import.ts
@@ -2,6 +2,7 @@ import { Router } from 'express';
import { z } from 'zod';
import { UrlImportService } from '../services/UrlImportService.js';
import { SchemaOrgRecipeParserService } from '../services/SchemaOrgRecipeParserService.js';
+import { HeuristicRecipeParserService } from '../services/HeuristicRecipeParserService.js';
const importUrlSchema = z.object({
url: z.string().url('A valid URL is required'),
@@ -11,6 +12,7 @@ export function createImportRoutes(): Router {
const router = Router();
const urlImportService = new UrlImportService();
const schemaOrgParser = new SchemaOrgRecipeParserService();
+ const heuristicParser = new HeuristicRecipeParserService();
/**
* POST /api/import/url
@@ -28,6 +30,11 @@ export function createImportRoutes(): Router {
if (draft) break;
}
+ // Fallback: heuristic HTML parser when Schema.org data is missing/invalid
+ if (!draft) {
+ draft = heuristicParser.parseHtml(result.html, result.source_url);
+ }
+
res.status(200).json({
success: true,
data: { ...result, draft_recipe: draft },
diff --git a/src/backend/services/HeuristicRecipeParserService.ts b/src/backend/services/HeuristicRecipeParserService.ts
new file mode 100644
index 0000000..1c1711c
--- /dev/null
+++ b/src/backend/services/HeuristicRecipeParserService.ts
@@ -0,0 +1,107 @@
+import type { CreateRecipeInput } from '../types/recipe.js';
+
+/**
+ * Lightweight fallback parser for pages without usable Schema.org Recipe JSON-LD.
+ *
+ * Works on the raw HTML string with regular expressions only (no DOM), so results
+ * are best-effort: containers are matched lazily up to the first closing tag of the
+ * same name, which means nested lists/divs of the same tag can be cut short.
+ */
+export class HeuristicRecipeParserService {
+  /**
+   * Named character references decoded by normalizeText (keys are lower-case).
+   * A Map is used instead of an object literal so names such as `constructor`
+   * can never resolve to inherited prototype members.
+   */
+  private static readonly NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
+    ['nbsp', ' '],
+    ['amp', '&'],
+    ['quot', '"'],
+    ['apos', "'"],
+    ['lt', '<'],
+    ['gt', '>'],
+  ]);
+
+  /**
+   * Builds a draft recipe from plain HTML.
+   *
+   * @param html - Full page markup to scan.
+   * @param sourceUrl - Passed through unchanged as `source_url` on the draft.
+   * @returns A draft when at least one ingredient or instruction was found;
+   *   `null` otherwise (a title alone is not considered a recipe).
+   */
+  parseHtml(html: string, sourceUrl?: string): CreateRecipeInput | null {
+    const ingredients = this.extractSectionList(html, 'ingredients');
+    // Sites label the method either "Instructions" or "Directions"; accept both.
+    const instructions = this.uniqueNonEmpty([
+      ...this.extractSectionList(html, 'instructions'),
+      ...this.extractSectionList(html, 'directions'),
+    ]);
+
+    if (ingredients.length === 0 && instructions.length === 0) {
+      return null;
+    }
+
+    return {
+      title: this.extractTitle(html) ?? 'Imported Recipe',
+      ingredients,
+      instructions,
+      source_url: sourceUrl,
+    };
+  }
+
+  /**
+   * Picks a recipe title: the first `<h1>` with visible text, otherwise the
+   * `<title>` element with any trailing site name removed.
+   *
+   * @returns The title, or `undefined` when neither element yields text.
+   */
+  private extractTitle(html: string): string | undefined {
+    const h1Match = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i);
+    const heading = h1Match?.[1] ? this.normalizeText(h1Match[1]) : '';
+    // An <h1> that only wraps markup (e.g. a logo image) normalises to '' and
+    // must not win over <title> or the caller's default.
+    if (heading) return heading;
+
+    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
+    const raw = titleMatch?.[1] ? this.normalizeText(titleMatch[1]) : '';
+    if (!raw) return undefined;
+
+    // Common site title separators (e.g., "Recipe Name | Site")
+    const [first] = raw.split(/\s[-|–—:]\s/);
+    return first?.trim() || raw;
+  }
+
+  /**
+   * Collects the `<li>` texts belonging to one recipe section.
+   *
+   * Strategy 1: a heading whose text is exactly `sectionName`, immediately
+   * followed by a `<ul>`/`<ol>`. Strategy 2 (used when the first yields nothing):
+   * every `<ul>`/`<ol>`/`<div>` whose class or id contains the singular form of
+   * `sectionName` (e.g. "ingredient").
+   *
+   * @returns De-duplicated, non-empty item texts in document order.
+   */
+  private extractSectionList(html: string, sectionName: 'ingredients' | 'instructions' | 'directions'): string[] {
+    const headingPattern = new RegExp(
+      `<h[1-6][^>]*>\\s*${sectionName}\\s*<\\/h[1-6]>\\s*<(ul|ol)[^>]*>([\\s\\S]*?)<\\/\\1>`,
+      'i',
+    );
+
+    const headingMatch = html.match(headingPattern);
+    if (headingMatch?.[2]) {
+      const items = this.extractListItems(headingMatch[2]);
+      // Only trust the heading match when it actually produced items; otherwise
+      // give the class/id heuristic a chance.
+      if (items.length > 0) return items;
+    }
+
+    const classPattern = new RegExp(
+      `<(ul|ol|div)[^>]*(class|id)=["'][^"']*${sectionName.slice(0, -1)}[^"']*["'][^>]*>([\\s\\S]*?)<\\/\\1>`,
+      'gi',
+    );
+
+    const candidates: string[] = [];
+    for (const match of html.matchAll(classPattern)) {
+      candidates.push(...this.extractListItems(match[3] ?? ''));
+    }
+
+    return this.uniqueNonEmpty(candidates);
+  }
+
+  /** Returns the normalised text of every `<li>` in `sectionHtml`, without blanks or duplicates. */
+  private extractListItems(sectionHtml: string): string[] {
+    const items = [...sectionHtml.matchAll(/<li[^>]*>([\s\S]*?)<\/li>/gi)].map((match) =>
+      this.normalizeText(match[1] ?? ''),
+    );
+
+    return this.uniqueNonEmpty(items);
+  }
+
+  /**
+   * Converts an HTML fragment to plain text: tags become spaces, character
+   * references are decoded, and whitespace runs collapse to a single space.
+   */
+  private normalizeText(text: string): string {
+    const withoutTags = text.replace(/<[^>]+>/g, ' ');
+
+    // Decode in ONE pass so already-decoded output is never decoded again
+    // ("&amp;lt;" must become "&lt;", not "<").
+    const decoded = withoutTags.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
+      if (body.startsWith('#')) {
+        const isHex = body[1] === 'x' || body[1] === 'X';
+        const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
+        // String.fromCodePoint throws outside 0..0x10FFFF; keep such input verbatim.
+        return Number.isInteger(codePoint) && codePoint > 0 && codePoint <= 0x10ffff
+          ? String.fromCodePoint(codePoint)
+          : entity;
+      }
+      // Unknown named references are left untouched rather than dropped.
+      return HeuristicRecipeParserService.NAMED_ENTITIES.get(body.toLowerCase()) ?? entity;
+    });
+
+    return decoded.replace(/\s+/g, ' ').trim();
+  }
+
+  /** Trims every value, drops empty ones and removes duplicates, keeping first-seen order. */
+  private uniqueNonEmpty(values: string[]): string[] {
+    return [...new Set(values.map((v) => v.trim()).filter(Boolean))];
+  }
+}
diff --git a/src/backend/tests/import.test.ts b/src/backend/tests/import.test.ts
index 6e14a76..2d4c25a 100644
--- a/src/backend/tests/import.test.ts
+++ b/src/backend/tests/import.test.ts
@@ -91,6 +91,48 @@ describe('Import API', () => {
});
});
+  it('should use heuristic fallback parser when Schema.org data is missing', async () => {
+    const html = `
+      <html>
+        <head><title>Easy Banana Bread | Example</title></head>
+        <body>
+          <h1>Easy Banana Bread</h1>
+          <h2>Ingredients</h2>
+          <ul>
+            <li>3 ripe bananas</li>
+            <li>2 cups flour</li>
+          </ul>
+          <h2>Instructions</h2>
+          <ol>
+            <li>Mash bananas.</li>
+            <li>Bake at 350°F for 50 minutes.</li>
+          </ol>
+        </body>
+      </html>
+    `;
+    // Stub global fetch so the import resolves to the fixture page without network access.
+    vi.spyOn(globalThis, 'fetch').mockResolvedValue({
+      ok: true,
+      status: 200,
+      headers: new Headers({ 'content-type': 'text/html; charset=utf-8' }),
+      text: async () => html,
+    } as Response);
+    // Drive the real route end to end.
+    const response = await request(app)
+      .post('/api/import/url')
+      .send({ url: 'https://example.com/banana-bread' })
+      .expect(200);
+    // The page has no JSON-LD, so the draft must come from the heuristic parser.
+    expect(response.body.success).toBe(true);
+    expect(response.body.data.json_ld_blocks).toEqual([]);
+    expect(response.body.data.draft_recipe).toEqual({
+      title: 'Easy Banana Bread',
+      ingredients: ['3 ripe bananas', '2 cups flour'],
+      instructions: ['Mash bananas.', 'Bake at 350°F for 50 minutes.'],
+      source_url: 'https://example.com/banana-bread'
+    });
+  });
+
it('should return draft_recipe as null for non-recipe JSON-LD', async () => {
const html = `