// recipe-manager/src/backend/services/CopyMeThatHtmlParser.ts

import type { CreateRecipeInput } from '../types/recipe.js';

/**
 * One recipe as read from a CopyMeThat HTML export by `CopyMeThatHtmlParser`,
 * before it is converted to `CreateRecipeInput`.
 */
export interface ParsedCopyMeThatRecipe {
  /** Recipe title, taken from the `<div id="name">` element. */
  title: string;
  /** `href` of the `<a id="original_link">` element, when present. */
  sourceUrl?: string;
  /** Text of the `<div id="description">` element, when present. */
  description?: string;
  /**
   * `src` of the `<img class="recipeImage">` element exactly as written in
   * the export; it may be a relative path such as `images/...`.
   */
  imageUrl?: string;
  /** Text of every `<span class="recipeCategory">` element, in document order. */
  tags: string[];
  /** Whether the recipe block carries the "made this" marker. */
  made: boolean;
  /** Integer rating from `<span id="ratingValue">`; only values 1-5 are kept. */
  rating?: number;
  /** Text of the `<a id="recipeYield">` element (not yet converted to a number). */
  servings?: string;
  /** Text of every `<li class="recipeIngredient">` element, in document order. */
  ingredients: string[];
  /** Text of every `<li class="instruction">` element, in document order. */
  instructions: string[];
  /** Note text collected from the `<div id="recipeNotes">` section, when present. */
  notes?: string;
}

/**
 * Parses the CopyMeThat HTML export format with regular expressions.
 * Supports both single-recipe HTML and multi-recipe exports.
 *
 * Every recipe is expected to live in a `<div class="recipe">` element; the
 * individual fields are located by element id / class inside that block.
 */
export class CopyMeThatHtmlParser {
  /** Named entities decoded by `cleanText`; anything else is left untouched. */
  private static readonly NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['amp', '&'],
    ['apos', "'"],
    ['nbsp', ' '],
  ]);

  /**
   * Parse all recipes from a CopyMeThat HTML export file.
   *
   * @param html Full text of the export document.
   * @returns Every recipe block that has a title, at least one ingredient and
   *   at least one instruction. Invalid blocks are dropped (and logged).
   */
  parseRecipes(html: string): ParsedCopyMeThatRecipe[] {
    const recipeBlocks = this.extractRecipeBlocks(html);
    console.log(`[CopyMeThatHtmlParser] Found ${recipeBlocks.length} recipe blocks`);
    const parsed = recipeBlocks
      .map(block => this.parseRecipeBlock(block))
      .filter((recipe): recipe is ParsedCopyMeThatRecipe => recipe !== null);
    console.log(`[CopyMeThatHtmlParser] Successfully parsed ${parsed.length} recipes`);
    return parsed;
  }

  /**
   * Extract individual recipe HTML blocks from the document.
   *
   * A block starts at a `<div class="recipe">` opening tag and runs up to the
   * next such tag, `</body>`, or the end of the input, whichever comes first.
   */
  private extractRecipeBlocks(html: string): string[] {
    const blocks: string[] = [];
    // Match with flexible whitespace around = and quotes
    const recipeRegex = /<div\s+class\s*=\s*["']recipe["'][^>]*>([\s\S]*?)(?=<div\s+class\s*=\s*["']recipe["']|<\/body>|$)/gi;

    let match: RegExpExecArray | null;
    while ((match = recipeRegex.exec(html)) !== null) {
      blocks.push(match[0]);
    }

    return blocks;
  }

  /**
   * Parse a single recipe HTML block.
   *
   * @returns The parsed recipe, or `null` when the block has no usable title,
   *   no ingredients or no instructions, or when parsing throws.
   */
  private parseRecipeBlock(html: string): ParsedCopyMeThatRecipe | null {
    try {
      // Clean the title before validating it so that a title consisting only
      // of markup/entities is treated as missing.
      const title = this.cleanText(this.extractById(html, 'name') ?? '');
      const sourceUrl = this.extractLinkById(html, 'original_link');
      const description = this.extractById(html, 'description');
      const imageUrl = this.extractAttr(html, /<img\s+class\s*=\s*["']recipeImage["'][^>]*src\s*=\s*["']([^"']+)["']/i);

      const tags = this.extractTags(html);
      // Same whitespace/quote tolerance as the other id lookups, plus the
      // literal marker text as a fallback.
      const made = /\bid\s*=\s*["']made_this["']/i.test(html) || html.includes('I made this');
      const rating = this.extractRating(html);
      const servings = this.extractText(html, /<a\s+id\s*=\s*["']recipeYield["'][^>]*>([^<]+)<\/a>/i);

      // These three are returned already cleaned. They must NOT be cleaned a
      // second time: cleanText is not idempotent (it would decode entities
      // twice, strip decoded "<...>" text as if it were a tag, and collapse
      // the blank line separating individual notes).
      const ingredients = this.extractListItems(html, 'recipeIngredient');
      const instructions = this.extractListItems(html, 'instruction');
      const notes = this.extractNotes(html);

      if (!title || ingredients.length === 0 || instructions.length === 0) {
        console.log(`[Parser] Rejected recipe - title: ${!!title}, ingredients: ${ingredients.length}, instructions: ${instructions.length}`);
        return null; // Invalid recipe
      }

      return {
        title,
        sourceUrl: sourceUrl || undefined,
        description: description ? this.cleanText(description) : undefined,
        imageUrl: imageUrl || undefined,
        tags,
        made,
        rating,
        servings: servings ? this.cleanText(servings) : undefined,
        ingredients,
        instructions,
        notes: notes || undefined,
      };
    } catch (error) {
      console.error('Error parsing recipe block:', error);
      return null;
    }
  }

  /**
   * Return the first capture group of `regex` applied to `html`.
   *
   * @param regex Pattern with at least one capture group; expected to be
   *   non-global so that no `lastIndex` state is carried between calls.
   * @returns The captured text, or `null` when the pattern does not match.
   */
  private extractText(html: string, regex: RegExp): string | null {
    return regex.exec(html)?.[1] ?? null;
  }

  /**
   * Extract an attribute value captured by the first group of `regex`.
   * Same contract as `extractText`; kept as a separate name for readability
   * at the call sites.
   */
  private extractAttr(html: string, regex: RegExp): string | null {
    return this.extractText(html, regex);
  }

  /**
   * Extract text from a `<div>` by id (handles spaces around =).
   * Only captures the text immediately following the opening tag, not nested
   * elements.
   *
   * @returns The trimmed raw text (entities not yet decoded), or `null`.
   */
  private extractById(html: string, id: string): string | null {
    const regex = new RegExp(`<div\\s+id\\s*=\\s*["']${this.escapeRegExp(id)}["'][^>]*>\\s*([^<]+)`, 'i');
    return this.extractText(html, regex)?.trim() ?? null;
  }

  /**
   * Extract href from an `<a>` element by id (handles spaces around =).
   */
  private extractLinkById(html: string, id: string): string | null {
    const regex = new RegExp(`<a\\s+id\\s*=\\s*["']${this.escapeRegExp(id)}["'][^>]*href\\s*=\\s*["']([^"']+)["']`, 'i');
    return this.extractAttr(html, regex);
  }

  /**
   * Extract tags from the `<span class="recipeCategory">` elements.
   *
   * @returns Cleaned, non-empty tag names in document order.
   */
  private extractTags(html: string): string[] {
    const tags: string[] = [];
    const tagRegex = /<span\s+class\s*=\s*["']recipeCategory["'][^>]*>([^<]+)<\/span>/gi;

    let match: RegExpExecArray | null;
    while ((match = tagRegex.exec(html)) !== null) {
      const tag = this.cleanText(match[1] ?? '');
      if (tag) tags.push(tag);
    }

    return tags;
  }

  /**
   * Extract rating value (1-5) from `<span id="ratingValue">`.
   *
   * @returns The rating, or `undefined` when absent or outside 1-5.
   */
  private extractRating(html: string): number | undefined {
    const digits = this.extractText(html, /<span\s+id\s*=\s*["']ratingValue["'][^>]*>(\d+)<\/span>/i);
    if (digits === null) return undefined;

    const rating = parseInt(digits, 10);
    return rating >= 1 && rating <= 5 ? rating : undefined;
  }

  /**
   * Extract list items (ingredients or instructions).
   *
   * @param className Exact value of the `class` attribute on the `<li>` items.
   * @returns Cleaned, non-empty item texts in document order.
   */
  private extractListItems(html: string, className: string): string[] {
    const items: string[] = [];
    // More flexible regex to handle spaces around =
    const itemRegex = new RegExp(`<li\\s+class\\s*=\\s*["']${this.escapeRegExp(className)}["'][^>]*>([\\s\\S]*?)<\\/li>`, 'gi');

    let match: RegExpExecArray | null;
    while ((match = itemRegex.exec(html)) !== null) {
      const text = this.cleanText(match[1] ?? '');
      if (text) items.push(text);
    }

    return items;
  }

  /**
   * Extract recipe notes.
   *
   * Collects every `<div class="recipeNote">` inside `<div id="recipeNotes">`.
   *
   * @returns The cleaned notes joined by a blank line, or `null` when the
   *   section is missing or contains no non-empty note.
   */
  private extractNotes(html: string): string | null {
    const opening = /<div\s+id\s*=\s*["']recipeNotes["'][^>]*>/i.exec(html);
    if (!opening) return null;

    // The container holds nested <div>s, so its end must be found by
    // balancing tags; a lazy match up to the first </div> would cut the
    // section off inside the first note.
    const notesHtml = this.extractDivContent(html, opening.index + opening[0].length);
    const noteTexts: string[] = [];
    const noteRegex = /<div\s+class\s*=\s*["']recipeNote["'][^>]*>([\s\S]*?)<\/div>/gi;

    let match: RegExpExecArray | null;
    while ((match = noteRegex.exec(notesHtml)) !== null) {
      const note = this.cleanText(match[1] ?? '');
      if (note) noteTexts.push(note);
    }

    return noteTexts.length > 0 ? noteTexts.join('\n\n') : null;
  }

  /**
   * Return the inner HTML of a `<div>` whose content starts at `contentStart`
   * (the index just past its opening tag), honouring nested `<div>`s.
   * If the closing tag is missing, everything up to the end of `html` is
   * returned.
   */
  private extractDivContent(html: string, contentStart: number): string {
    const divTag = /<(\/?)div\b[^>]*>/gi;
    divTag.lastIndex = contentStart;

    let depth = 1;
    let tag: RegExpExecArray | null;
    while ((tag = divTag.exec(html)) !== null) {
      depth += tag[1] ? -1 : 1;
      if (depth === 0) return html.slice(contentStart, tag.index);
    }

    return html.slice(contentStart);
  }

  /**
   * Escape characters that have a special meaning in a regular expression so
   * that `value` can be embedded literally in a `RegExp` source string.
   */
  private escapeRegExp(value: string): string {
    return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Strip HTML tags, decode HTML entities and normalize whitespace.
   *
   * Entities are decoded in a single pass so that the result of one decoding
   * is never decoded again (`&amp;#39;` yields the text `&#39;`). Besides the
   * named entities in `NAMED_ENTITIES`, decimal (`&#39;`) and hexadecimal
   * (`&#x27;`) character references are supported; unknown or out-of-range
   * references are left as written.
   *
   * Not idempotent: apply it exactly once to raw markup.
   */
  private cleanText(text: string): string {
    return text
      .replace(/<[^>]+>/g, '') // Remove HTML tags
      .replace(
        /&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi,
        (entity: string, hex?: string, dec?: string, name?: string): string => {
          if (name !== undefined) {
            return CopyMeThatHtmlParser.NAMED_ENTITIES.get(name) ?? entity;
          }
          const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? '', 10);
          // String.fromCodePoint throws above U+10FFFF; keep such input as-is.
          return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
        },
      )
      .replace(/\s+/g, ' ') // Normalize whitespace
      .trim();
  }

  /**
   * Convert parsed recipe to CreateRecipeInput format.
   *
   * Ingredients and instructions keep their order and receive zero-based
   * `position` values. The image URL is kept only when it is an `images/...`
   * export path (made root-relative) or an absolute http(s) URL.
   */
  toCreateRecipeInput(parsed: ParsedCopyMeThatRecipe): CreateRecipeInput {
    // Normalize image URL: if relative path (images/...), convert to absolute /images/...
    let imageUrl = parsed.imageUrl;
    if (imageUrl) {
      if (imageUrl.startsWith('images/')) {
        imageUrl = '/' + imageUrl;
      } else if (!imageUrl.startsWith('http://') && !imageUrl.startsWith('https://')) {
        // Other relative paths or invalid URLs — discard
        imageUrl = undefined;
      }
    }
    return {
      title: parsed.title,
      description: parsed.description,
      source_url: parsed.sourceUrl,
      image_url: imageUrl || undefined,
      made: parsed.made,
      rating: parsed.rating,
      notes: parsed.notes,
      servings: parsed.servings ? this.extractServingCount(parsed.servings) : undefined,
      ingredients: parsed.ingredients.map((item, index) => ({
        item,
        position: index,
      })),
      steps: parsed.instructions.map((instruction, index) => ({
        instruction,
        position: index,
      })),
    };
  }

  /**
   * Try to extract numeric serving count from serving string.
   *
   * @returns The number directly preceding the word "serving(s)", or
   *   `undefined` when the string has no such pattern.
   */
  private extractServingCount(servingStr: string): number | undefined {
    const match = /(\d+)\s*servings?/i.exec(servingStr);
    return match?.[1] !== undefined ? parseInt(match[1], 10) : undefined;
  }
}