recipe-manager/src/backend/routes/import.ts

285 lines
8.2 KiB
TypeScript

import { Router } from 'express';
import { z } from 'zod';
import { parseSchemaOrgRecipe } from '../services/SchemaOrgRecipeParserService.js';
import { parseHeuristicRecipe } from '../services/HeuristicRecipeParserService.js';
import { UrlImportError, UrlImportService } from '../services/UrlImportService.js';
import type { CreateRecipeInput } from '../types/recipe.js';
import { asyncHandler } from '../middleware.js';
/** Validator for the single `url` field: a string that zod accepts as a URL. */
const importUrlField = z.string().url('Please provide a valid URL (including https://).');

/** Request-body schema for `POST /url`; `parse` throws when `url` is missing or invalid. */
const importUrlSchema = z.object({ url: importUrlField });
/** One ingredient row of an import draft; only `item` is mandatory. */
interface ImportRouteDraftIngredient {
  item: string;
  quantity?: string | null;
  unit?: string | null;
  notes?: string | null;
}

/**
 * Recipe draft returned to the client by the import route.
 *
 * A draft is only produced when it has a title, at least one ingredient and
 * at least one instruction; every other field is optional.
 */
interface ImportRouteDraftRecipe {
  title: string;
  description?: string;
  servings?: number;
  prep_time_minutes?: number;
  cook_time_minutes?: number;
  source_url?: string;
  image_url?: string;
  ingredients: ImportRouteDraftIngredient[];
  /** Instruction texts, one entry per step. */
  instructions: string[];
  tagIds?: number[];
}
/** Describes how the draft was obtained and what was skipped along the way. */
interface ImportRouteParseInfo {
  /** True when the draft was built from a schema.org JSON-LD recipe. */
  schema_org_used: boolean;
  /** True when the draft was built from the HTML heuristics instead. */
  heuristic_used: boolean;
  /** Human-readable notes, e.g. about malformed JSON-LD blocks that were skipped. */
  warnings: string[];
}

/** Payload placed in `data` of a successful `POST /url` response. */
interface ImportRouteResult {
  title: string;
  source_url: string;
  /** The JSON-LD blocks of the page that parsed successfully. */
  json_ld_blocks: unknown[];
  draft_recipe: ImportRouteDraftRecipe;
  /** Ingredient item names of the draft, without quantity/unit/notes. */
  ingredients: string[];
  instructions: string[];
  parse: ImportRouteParseInfo;
}
/**
 * Builds the router that exposes `POST /url`, which fetches a page and turns
 * it into a recipe draft.
 *
 * Flow of the handler:
 *  1. Validate the body with `importUrlSchema` (a validation failure throws and
 *     is left to whatever error handling `asyncHandler` forwards to).
 *  2. Fetch the page through `urlImportService`. Errors carrying an `IMPORT_*`
 *     code are translated to an HTTP status via `mapUrlImportError`; anything
 *     else is passed to `next`.
 *  3. Prefer a schema.org JSON-LD recipe; fall back to HTML heuristics only
 *     when no usable schema.org draft was produced.
 *  4. Respond 422 when neither strategy yields a usable draft.
 *
 * Every response uses the `{ success, data, error }` envelope.
 *
 * @param urlImportService Service used to download the page; injectable for tests.
 * @returns The configured Express router.
 */
export function createImportRoutes(urlImportService = new UrlImportService()) {
  const router = Router();

  router.post('/url', asyncHandler(async (req, res, next) => {
    const { url } = importUrlSchema.parse(req.body);

    let fetched;
    try {
      fetched = await urlImportService.fetchFromUrl(url);
    } catch (err: unknown) {
      if (!hasImportErrorCode(err)) {
        next(err);
        return;
      }
      const mapped = mapUrlImportError(err);
      res.status(mapped.status).json({ success: false, data: null, error: mapped.message });
      return;
    }

    const parseWarnings: string[] = [];
    const parsedJsonLdBlocks = parseJsonLdBlocks(fetched.json_ld_blocks, parseWarnings);
    const schemaCandidate = findSchemaOrgRecipeCandidate(parsedJsonLdBlocks);
    const schemaDraft = schemaCandidate
      ? toImportDraftSafe(parseSchemaOrgRecipe(schemaCandidate), fetched.source_url)
      : null;
    // The heuristic parser only runs when schema.org data gave no usable draft.
    const heuristicDraft = schemaDraft
      ? null
      : toHeuristicImportDraft(fetched.html, fetched.source_url);
    const draft = schemaDraft ?? heuristicDraft;

    if (!draft) {
      res.status(422).json({
        success: false,
        data: null,
        error: 'Parse failed: Could not extract a usable recipe from this page.',
      });
      return;
    }

    const response: ImportRouteResult = {
      title: draft.title,
      source_url: fetched.source_url,
      json_ld_blocks: parsedJsonLdBlocks,
      draft_recipe: draft,
      ingredients: draft.ingredients.map((item) => item.item),
      instructions: draft.instructions,
      parse: {
        schema_org_used: Boolean(schemaDraft),
        heuristic_used: Boolean(!schemaDraft && heuristicDraft),
        warnings: parseWarnings,
      },
    };
    res.json({ success: true, data: response, error: null });
  }));

  return router;
}

/** Error shape accepted by `mapUrlImportError`, derived from its signature. */
type MappableImportError = Parameters<typeof mapUrlImportError>[0];

/**
 * Narrows an unknown thrown value to an import error by checking for a string
 * `code` that starts with `IMPORT_`.
 *
 * The check is deliberately structural (not `instanceof`) so that any error
 * object carrying such a code is mapped, and it never throws itself: thrown
 * `null`/primitives or a non-string `code` simply yield `false`.
 */
function hasImportErrorCode(err: unknown): err is MappableImportError {
  if (typeof err !== 'object' || err === null) {
    return false;
  }
  const code = (err as { code?: unknown }).code;
  return typeof code === 'string' && code.startsWith('IMPORT_');
}
/**
 * Translates an import error into the HTTP status and message sent to the client.
 *
 * - `IMPORT_TIMEOUT` -> 504
 * - `IMPORT_NETWORK` -> 502
 * - `IMPORT_UNSUPPORTED_CONTENT` -> 415
 * - anything else (including `IMPORT_FETCH_FAILED`) -> the error's own `status`
 *   when it is 400 or above, otherwise 502
 *
 * The message is always the error's own message.
 */
function mapUrlImportError(error: UrlImportError): { status: number; message: string } {
  const { code, message } = error;

  if (code === 'IMPORT_TIMEOUT') {
    return { status: 504, message };
  }
  if (code === 'IMPORT_NETWORK') {
    return { status: 502, message };
  }
  if (code === 'IMPORT_UNSUPPORTED_CONTENT') {
    return { status: 415, message };
  }

  // IMPORT_FETCH_FAILED and unrecognised codes share the same fallback.
  const upstreamStatus = error.status;
  const status = upstreamStatus && upstreamStatus >= 400 ? upstreamStatus : 502;
  return { status, message };
}
/**
 * Parses each raw JSON-LD string with `JSON.parse`.
 *
 * Blocks that fail to parse are dropped and one warning per dropped block is
 * appended to `warnings` (the array is mutated in place).
 *
 * @param blocks Raw JSON-LD texts.
 * @param warnings Collector that receives a message for every skipped block.
 * @returns The successfully parsed values, in input order.
 */
function parseJsonLdBlocks(blocks: string[], warnings: string[]): unknown[] {
  return blocks.flatMap((text): unknown[] => {
    try {
      // Wrapped in a one-element array so a parsed top-level array stays one entry.
      return [JSON.parse(text) as unknown];
    } catch {
      warnings.push('Skipped malformed JSON-LD block.');
      return [];
    }
  });
}
/**
 * Picks the JSON-LD object to feed to the schema.org recipe parser.
 *
 * All recipe-typed objects found anywhere in `blocks` are gathered; the first
 * one with a string `name` wins, otherwise the first one found is used.
 *
 * @returns The chosen candidate, or `null` when no recipe-typed object exists.
 */
function findSchemaOrgRecipeCandidate(blocks: unknown[]): Record<string, unknown> | null {
  const found: Record<string, unknown>[] = [];
  blocks.forEach((block) => {
    collectRecipeCandidates(block, found);
  });

  if (found.length === 0) {
    return null;
  }

  const named = found.find((entry) => typeof entry.name === 'string');
  return named ?? found[0];
}
/**
 * Walks an arbitrary parsed JSON-LD value and appends every object whose
 * `@type` names a recipe to `sink`.
 *
 * Traversal is pre-order: an object is tested first, then its `@graph` (if
 * any), then its remaining properties in key order; arrays are walked in
 * index order. Each object is visited exactly once, so a recipe inside
 * `@graph` is reported once rather than twice.
 *
 * The walk uses an explicit stack instead of recursion so that deeply nested
 * (possibly hostile) JSON-LD cannot overflow the call stack.
 *
 * @param value Any parsed JSON value; non-objects are ignored.
 * @param sink Array that receives the matching objects (mutated in place).
 */
function collectRecipeCandidates(value: unknown, sink: Record<string, unknown>[]): void {
  const pending: unknown[] = [value];

  while (pending.length > 0) {
    const current = pending.pop();
    if (!current || typeof current !== 'object') {
      continue;
    }

    let children: unknown[];
    if (Array.isArray(current)) {
      children = current;
    } else {
      const node = current as Record<string, unknown>;
      if (isRecipeType(node['@type'])) {
        sink.push(node);
      }
      // `@graph` goes first; it is skipped below so it is not walked a second time.
      children = '@graph' in node ? [node['@graph']] : [];
      for (const [key, nested] of Object.entries(node)) {
        if (key !== '@graph') {
          children.push(nested);
        }
      }
    }

    // Push in reverse so the stack pops children in their original order.
    for (let index = children.length - 1; index >= 0; index -= 1) {
      const child = children[index];
      if (child && typeof child === 'object') {
        pending.push(child);
      }
    }
  }
}

/**
 * Tells whether a JSON-LD `@type` value names a recipe.
 *
 * Accepts a single string or an array of strings and matches any string
 * containing "recipe" case-insensitively (e.g. `Recipe`,
 * `https://schema.org/Recipe`). Any other value yields `false`.
 */
function isRecipeType(typeValue: unknown): boolean {
  const namesRecipe = (candidate: unknown): boolean =>
    typeof candidate === 'string' && candidate.toLowerCase().includes('recipe');

  if (Array.isArray(typeValue)) {
    return typeValue.some(namesRecipe);
  }
  return namesRecipe(typeValue);
}
/**
 * Converts parser output into the draft shape returned by the import route,
 * tolerating malformed data instead of throwing.
 *
 * - The title is trimmed; a missing or non-string title counts as empty.
 * - Ingredients keep only entries with a non-empty string `item`; `quantity`,
 *   `unit` and `notes` become `null` unless they are strings.
 * - Steps are reduced to their trimmed, non-empty `instruction` texts.
 * - Null/undefined entries in either list are skipped.
 *
 * @param parsed Recipe data produced by one of the recipe parsers.
 * @param sourceUrl URL used as `source_url` when `parsed` does not carry one.
 * @returns The draft, or `null` when the title, the ingredients or the
 *          instructions end up empty.
 */
function toImportDraftSafe(parsed: CreateRecipeInput, sourceUrl: string): ImportRouteDraftRecipe | null {
  // Checked with typeof like every other field, so a non-string title yields
  // "no draft" rather than a TypeError from `.trim()`.
  const title = typeof parsed.title === 'string' ? parsed.title.trim() : '';

  const ingredients = Array.isArray(parsed.ingredients)
    ? parsed.ingredients
        .filter((ingredient) => ingredient != null)
        .map((ingredient) => ({
          item: typeof ingredient.item === 'string' ? ingredient.item.trim() : '',
          quantity: typeof ingredient.quantity === 'string' ? ingredient.quantity : null,
          unit: typeof ingredient.unit === 'string' ? ingredient.unit : null,
          notes: typeof ingredient.notes === 'string' ? ingredient.notes : null,
        }))
        .filter((ingredient) => ingredient.item.length > 0)
    : [];

  const instructions = Array.isArray(parsed.steps)
    ? parsed.steps
        .filter((step) => step != null)
        .map((step) => (typeof step.instruction === 'string' ? step.instruction.trim() : ''))
        .filter((step) => step.length > 0)
    : [];

  if (!title || ingredients.length === 0 || instructions.length === 0) {
    return null;
  }

  return {
    title,
    description: parsed.description,
    servings: parsed.servings,
    prep_time_minutes: parsed.prep_time_minutes,
    cook_time_minutes: parsed.cook_time_minutes,
    source_url: parsed.source_url || sourceUrl,
    image_url: parsed.image_url,
    ingredients,
    instructions,
    tagIds: parsed.tagIds,
  };
}
/**
 * Fallback import path: builds a draft from raw HTML without structured data.
 *
 * The title comes from the page's `<title>` (or "Imported Recipe" when none is
 * found); ingredients and steps come from lists that follow headings
 * containing the respective keywords. The pieces are handed to
 * `parseHeuristicRecipe` and the result is validated by `toImportDraftSafe`.
 *
 * @returns The draft, or `null` when the extracted data is not usable.
 */
function toHeuristicImportDraft(html: string, sourceUrl: string): ImportRouteDraftRecipe | null {
  const heuristicInput = {
    title: extractTitle(html) || 'Imported Recipe',
    ingredients: extractListItems(html, ['ingredient']),
    steps: extractListItems(html, ['instruction', 'direction', 'method', 'step']),
    source_url: sourceUrl,
  };
  return toImportDraftSafe(parseHeuristicRecipe(heuristicInput), sourceUrl);
}
/**
 * Reads the text of the first `<title>` element in the HTML.
 *
 * @returns The normalized title text, or `null` when there is no `<title>`
 *          element or it is empty.
 */
function extractTitle(html: string): string | null {
  const rawTitle = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html)?.[1];
  return rawTitle ? normalizeText(rawTitle) : null;
}
/**
 * Collects the `<li>` texts of lists that follow a matching heading.
 *
 * Every `<h2>`-`<h4>` heading is inspected on its own. When its text contains
 * one of `headingKeywords`, the first `<ul>` or `<ol>` after that heading
 * supplies the items. Scanning headings independently matters: a heading that
 * does not match (e.g. "About") must not swallow the list that belongs to a
 * later, matching heading.
 *
 * @param html Raw page HTML.
 * @param headingKeywords Lower-case substrings to look for in heading text.
 * @returns Normalized, non-empty item texts in document order, without duplicates.
 */
function extractListItems(html: string, headingKeywords: string[]): string[] {
  const headingPattern = /<h[2-4]\b[^>]*>([\s\S]*?)<\/h[2-4]\s*>/gi;
  // The back-reference makes <ul> close with </ul> and <ol> with </ol>.
  const listPattern = /<(ul|ol)\b[^>]*>([\s\S]*?)<\/\1\s*>/gi;
  const itemPattern = /<li\b[^>]*>([\s\S]*?)<\/li\s*>/gi;
  const items: string[] = [];

  for (const heading of html.matchAll(headingPattern)) {
    const headingText = normalizeText(heading[1] ?? '').toLowerCase();
    if (!headingKeywords.some((keyword) => headingText.includes(keyword))) {
      continue;
    }

    // Search for the list starting right after this heading's closing tag.
    listPattern.lastIndex = (heading.index ?? 0) + heading[0].length;
    const list = listPattern.exec(html);
    if (!list) {
      continue;
    }

    for (const item of (list[2] ?? '').matchAll(itemPattern)) {
      const text = normalizeText(item[1] ?? '');
      if (text) {
        items.push(text);
      }
    }
  }

  return dedupe(items);
}

/** Named HTML entities that commonly appear in recipe text. */
const HTML_NAMED_ENTITIES = new Map<string, string>([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
  ['frac12', '\u00bd'],
  ['frac14', '\u00bc'],
  ['frac34', '\u00be'],
  ['deg', '\u00b0'],
  ['times', '\u00d7'],
  ['ndash', '\u2013'],
  ['mdash', '\u2014'],
  ['lsquo', '\u2018'],
  ['rsquo', '\u2019'],
  ['ldquo', '\u201c'],
  ['rdquo', '\u201d'],
  ['hellip', '\u2026'],
]);

/**
 * Resolves one entity reference.
 *
 * @param entity The full reference including `&` and `;`.
 * @param body The part between `&` and `;`.
 * @returns The decoded character, or `entity` unchanged when it is unknown
 *          or denotes an invalid code point.
 */
function decodeHtmlEntity(entity: string, body: string): string {
  if (!body.startsWith('#')) {
    return HTML_NAMED_ENTITIES.get(body) ?? entity;
  }
  const isHex = body[1] === 'x' || body[1] === 'X';
  const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
  const isValid = Number.isInteger(codePoint) && codePoint > 0 && codePoint <= 0x10ffff;
  return isValid ? String.fromCodePoint(codePoint) : entity;
}

/**
 * Turns an HTML fragment into plain text: tags are replaced by spaces,
 * numeric and common named entities are decoded, and runs of whitespace are
 * collapsed to single spaces with the ends trimmed.
 *
 * Entities are decoded in a single pass, so `&amp;nbsp;` becomes the literal
 * text `&nbsp;` rather than a space.
 */
function normalizeText(text: string): string {
  return text
    .replace(/<[^>]+>/g, ' ')
    .replace(/&(#\d+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);/g, decodeHtmlEntity)
    .replace(/\s+/g, ' ')
    .trim();
}

/** Returns the values with duplicates removed, keeping first occurrences in order. */
function dedupe(values: string[]): string[] {
  return Array.from(new Set(values));
}