// Neah/lib/utils/email-content.ts

/**
 * Centralized Email Content Utilities
 *
 * This file contains all core functions for email content processing:
 * - Content extraction
 * - HTML sanitization
 * - Text direction handling
 * - URL fixing
 *
 * Other modules should import from this file rather than implementing their own versions.
 */

import { sanitizeHtml } from './dom-purify-config';
import { detectTextDirection } from './text-direction';
import { EmailContent } from '@/types/email';
import { processCidReferences } from './email-utils';

/**
 * Extract the text and HTML bodies from the many shapes an email object can take.
 * Centralized implementation to reduce duplication across the codebase.
 *
 * Lookup order:
 * 1. `email` itself when it is a raw string (classified via `isHtmlContent`)
 * 2. `email.content` as an object: `html`/`text`, then nested `body`
 *    (string or object), then a string `data` property
 * 3. `email.content` as a raw string
 * 4. top-level `html`/`text`, then `body.html`/`bodyHtml`/`htmlBody` and
 *    `body.text`/`bodyText`/`plainText`
 *
 * The result is always normalized to the declared return types: non-string
 * bodies are ignored, `isHtml` is a real boolean and `direction` is either
 * `'ltr'` or `'rtl'`. When only HTML is found, `text` is derived from it with
 * `extractTextFromHtml`.
 *
 * @param email - Email-like value of unknown shape (object or raw string).
 * @returns The extracted bodies plus HTML flag and text direction. All fields
 *          are empty/default when nothing usable is found.
 */
export function extractEmailContent(email: any): { text: string; html: string; isHtml: boolean; direction: 'ltr' | 'rtl'; } {
  // Default empty values
  let textContent = '';
  let htmlContent = '';
  let isHtml = false;
  let direction: 'ltr' | 'rtl' = 'ltr';

  // Early exit if no email
  if (!email) {
    console.log('extractEmailContent: No email provided');
    return { text: '', html: '', isHtml: false, direction: 'ltr' };
  }

  // Only genuine strings are accepted as bodies; anything else counts as "missing".
  const asString = (value: unknown): string => (typeof value === 'string' ? value : '');

  // Only the two valid direction values are accepted; anything else keeps the fallback.
  const asDirection = (value: unknown, fallback: 'ltr' | 'rtl'): 'ltr' | 'rtl' =>
    value === 'rtl' ? 'rtl' : value === 'ltr' ? 'ltr' : fallback;

  // Sort a raw string body into the HTML or the text slot.
  const classifyRaw = (raw: string): { html: string; text: string } =>
    isHtmlContent(raw) ? { html: raw, text: '' } : { html: '', text: raw };

  try {
    // A raw string email is handled exactly like a raw string `content`.
    const content = typeof email === 'string' ? email : email.content;

    if (content && typeof content === 'object') {
      // Standard format with content object
      textContent = asString(content.text);
      htmlContent = asString(content.html);
      isHtml = Boolean(content.isHtml) || !!htmlContent;
      direction = asDirection(content.direction, 'ltr');

      // Handle complex email formats where content might be nested
      if (!textContent && !htmlContent) {
        const body = content.body;

        if (typeof body === 'string') {
          if (body) {
            ({ html: htmlContent, text: textContent } = classifyRaw(body));
            isHtml = !!htmlContent;
          }
        } else if (body && typeof body === 'object') {
          // Some email formats nest content inside body
          htmlContent = asString(body.html);
          textContent = asString(body.text);
          isHtml = Boolean(body.isHtml) || !!htmlContent;
          // Keep an explicit outer direction when the nested body has none.
          direction = asDirection(body.direction, direction);
        }

        // Check for data property which some email services use
        if (!textContent && !htmlContent && typeof content.data === 'string' && content.data) {
          ({ html: htmlContent, text: textContent } = classifyRaw(content.data));
          isHtml = !!htmlContent;
        }
      }
    } else if (typeof content === 'string') {
      ({ html: htmlContent, text: textContent } = classifyRaw(content));
      isHtml = !!htmlContent;
    } else {
      // Check other common properties
      htmlContent = asString(email.html);
      textContent = asString(email.text);
      isHtml = Boolean(email.isHtml) || !!htmlContent;
      direction = asDirection(email.direction, 'ltr');

      // If still no content, check for less common properties
      if (!htmlContent && !textContent) {
        htmlContent = asString(email.body?.html) || asString(email.bodyHtml) || asString(email.htmlBody);
        textContent = asString(email.body?.text) || asString(email.bodyText) || asString(email.plainText);
        isHtml = Boolean(email.body?.isHtml) || !!htmlContent;
        // Keep an explicit top-level direction when the body has none.
        direction = asDirection(email.body?.direction, direction);
      }
    }
  } catch (error) {
    // Best effort: return whatever was collected before the failure.
    console.error('Error extracting email content:', error);
  }

  // Ensure we always have at least some text content
  if (!textContent && htmlContent) {
    textContent = extractTextFromHtml(htmlContent);
  }

  // Log extraction results
  console.log('Extracted email content:', {
    hasHtml: !!htmlContent,
    htmlLength: htmlContent?.length || 0,
    hasText: !!textContent,
    textLength: textContent?.length || 0,
    isHtml,
    direction
  });

  return { text: textContent, html: htmlContent, isHtml, direction };
}

/**
 * Extract plain text from HTML content.
 *
 * When a `DOMParser` is available, the markup is parsed into a separate, inert
 * document (scripts do not run and resources such as images are not fetched),
 * and its text content is returned. Assigning untrusted email HTML to
 * `innerHTML` of an element created from the live document is avoided on
 * purpose: that can trigger resource loads and inline handlers like `onerror`.
 *
 * Without a DOM (e.g. on the server), tags are stripped with a regex, a small
 * set of common entities is decoded and whitespace is collapsed.
 *
 * @param html - HTML markup; empty or non-string input yields `''`.
 * @returns The text content of the markup.
 */
export function extractTextFromHtml(html: string): string {
  if (!html || typeof html !== 'string') return '';

  try {
    // Use DOM API if available
    if (typeof DOMParser !== 'undefined') {
      const parsed = new DOMParser().parseFromString(html, 'text/html');
      return parsed.documentElement?.textContent || '';
    }

    // Simple regex fallback for non-browser environments.
    // `&amp;` is decoded last so that e.g. `&amp;lt;` becomes `&lt;`, not `<`.
    return html.replace(/<[^>]*>/g, ' ')
      .replace(/&nbsp;/g, ' ')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#0?39;/g, "'")
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ')
      .trim();
  } catch (e) {
    console.error('Error extracting text from HTML:', e);
    // Fallback to basic strip
    return html.replace(/<[^>]*>/g, ' ').trim();
  }
}

/**
 * Check if a string is likely HTML content.
 *
 * The string counts as HTML when, ignoring surrounding whitespace, it starts
 * with `<` and contains an opening `html`, `body`, `div`, `p` or `br` tag.
 * Matching is case-insensitive and tolerates attributes and self-closing
 * forms (`<p class="x">`, `<br/>`, `<DIV>`). The tag name must be followed by
 * whitespace, `/`, `>` or the end of the string, so `<pre>` is not mistaken
 * for `<p>` and a leading `<a@b.com>` address is not mistaken for markup.
 *
 * @param content - Candidate string; empty or non-string input yields `false`.
 * @returns `true` when the content looks like HTML.
 */
export function isHtmlContent(content: string): boolean {
  if (!content || typeof content !== 'string') return false;

  const trimmed = content.trim();
  return trimmed.startsWith('<') &&
    /<(?:html|body|div|p|br)(?=[\s/>]|$)/i.test(trimmed);
}

/**
 * Format and standardize email content for display.
 * This is the main entry point for rendering email content.
 *
 * - HTML bodies are run through `processHtmlContent` with sanitization enabled
 *   and wrapped in a styled `email-content` container whose `dir` attribute
 *   comes from the processing result.
 * - Text-only bodies are rendered with `formatPlainTextToHtml`.
 * - When nothing can be extracted, a "No content available" placeholder is returned.
 * - If anything throws, an error box is returned; the error message is
 *   HTML-escaped before being embedded so it cannot inject markup.
 *
 * @param email - Email-like value passed to `extractEmailContent`.
 * @returns An HTML string, or `''` when `email` is falsy.
 */
export function formatEmailContent(email: any): string {
  if (!email) {
    console.log('formatEmailContent: No email provided');
    return '';
  }

  try {
    // Extract content from email
    const { text, html, isHtml, direction } = extractEmailContent(email);

    console.log('formatEmailContent processing:', {
      hasHtml: !!html,
      htmlLength: html?.length || 0,
      hasText: !!text,
      textLength: text?.length || 0,
      emailType: typeof email === 'string' ? 'string' : 'object',
      isHtml,
      direction
    });

    // If we have HTML content, sanitize and standardize it
    if (html) {
      const processed = processHtmlContent(html, { sanitize: true });

      console.log('HTML content processed:', {
        processedLength: processed.sanitizedContent?.length || 0,
        isEmpty: !processed.sanitizedContent || processed.sanitizedContent.trim().length === 0
      });

      // Apply styling
      return `<div class="email-content" style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif; line-height: 1.6; color: #333; max-width: 100%; overflow-x: auto; overflow-wrap: break-word; word-wrap: break-word;" dir="${processed.direction}">${processed.sanitizedContent}</div>`;
    }

    // If we only have text content, format it properly
    if (text) {
      console.log('Using plain text formatting');
      return formatPlainTextToHtml(text);
    }

    // Default case: empty or unrecognized content
    return '<div class="email-content-empty" style="padding: 20px; text-align: center; color: #666;">No content available</div>';
  } catch (error) {
    console.error('formatEmailContent: Error formatting email content:', error);

    // Escape the message: it is embedded in markup and may echo untrusted input.
    const rawMessage = error instanceof Error ? error.message : 'Unknown error';
    const safeMessage = String(rawMessage)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#039;');

    return `<div class="email-content-error" style="padding: 15px; color: #721c24; background-color: #f8d7da; border: 1px solid #f5c6cb; border-radius: 4px;"><p>Error displaying email content</p><p style="font-size: 12px; margin-top: 10px;">${safeMessage}</p></div>`;
  }
}

/**
 * Process HTML content to ensure safe rendering and proper formatting.
 *
 * Steps, in order:
 * 1. Resolve `cid:` references via `processCidReferences` when attachments are given.
 * 2. Sanitize with `sanitizeHtml` unless `options.sanitize` is explicitly `false`.
 * 3. For forwarded/reply content, add default inline styles to `<table>`,
 *    `<td>` and `<blockquote>` tags that do not already carry a `style` attribute.
 * 4. Apply small client-quirk fixes (protocol-relative and `http://` image
 *    sources, signature separators, inter-tag whitespace).
 * 5. Give any remaining unstyled `<blockquote>` the default quote style.
 *
 * If a step throws, the function never hands back unsanitized markup while
 * sanitization was requested: it returns the already-sanitized content when
 * sanitization had completed, otherwise the HTML-escaped input.
 *
 * NOTE: `blockExternalContent` is accepted and logged, but this function does
 * not act on it.
 *
 * @param htmlContent - Raw HTML of the message body.
 * @param options - Processing options. A string is accepted for legacy callers
 *                  (former `textContent` parameter) and is treated as `{ sanitize: true }`.
 * @returns The processed markup, whether it contains `<img`, whether it
 *          contains `https://`, and the direction reported by `detectTextDirection`.
 */
export function processHtmlContent(
  htmlContent: string,
  options?: {
    sanitize?: boolean;
    blockExternalContent?: boolean;
    preserveReplyFormat?: boolean;
    attachments?: Array<{
      filename?: string;
      name?: string;
      contentType?: string;
      content?: string;
      contentId?: string;
    }>;
  } | string // Support for legacy textContent parameter
): {
  sanitizedContent: string;
  hasImages: boolean;
  hasExternalContent: boolean;
  direction: 'ltr' | 'rtl';
} {
  // Handle legacy string parameter (textContent)
  if (typeof options === 'string') {
    options = { sanitize: true };
  }

  console.log('Processing HTML content:', {
    contentLength: htmlContent?.length || 0,
    startsWithHtml: htmlContent?.startsWith('<html'),
    startsWithDiv: htmlContent?.startsWith('<div'),
    containsForwardedMessage: htmlContent?.includes('---------- Forwarded message ----------'),
    containsQuoteHeader: htmlContent?.includes('<div class="gmail_quote"'),
    sanitize: options?.sanitize,
    preserveReplyFormat: options?.preserveReplyFormat,
    blockExternalContent: options?.blockExternalContent,
    hasAttachments: options?.attachments?.length || 0
  });

  if (!htmlContent) {
    return {
      sanitizedContent: '',
      hasImages: false,
      hasExternalContent: false,
      direction: 'ltr',
    };
  }

  // Store the original content for comparison
  const originalContent = htmlContent;

  // Process CID references before sanitization
  if (options?.attachments?.length) {
    console.log('Processing CID references in processHtmlContent');
    htmlContent = processCidReferences(htmlContent, options.attachments);
  }

  // Sanitization is on by default and only skipped when explicitly disabled.
  const sanitizeRequested = options?.sanitize !== false;

  const TABLE_STYLE = 'margin: 10px 0; border-collapse: collapse; font-size: 13px; color: #333;';
  const CELL_STYLE = 'padding: 3px 5px; vertical-align: top;';
  const BLOCKQUOTE_STYLE = 'margin: 0; padding-left: 10px; border-left: 3px solid #ddd; color: #505050; background-color: #f9f9f9; padding: 10px;';

  // Escape markup so it renders as inert text.
  const escapeHtml = (raw: string): string =>
    String(raw)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#039;');

  // Add a default inline style to every `<tag ...>` that has no style attribute yet.
  // Tags that already carry one are left untouched to avoid duplicate attributes.
  const addDefaultStyle = (markup: string, tag: string, style: string): string =>
    markup.replace(
      new RegExp(`<${tag}(?=[\\s/>])([^>]*)>`, 'g'),
      (match: string, attrs: string) =>
        /\sstyle\s*=/i.test(attrs) ? match : `<${tag}${attrs} style="${style}">`
    );

  // Holds the sanitizer output so the error path can still return safe markup.
  let sanitizedSnapshot: string | null = null;

  try {
    // Special handling for reply/forwarded content with less aggressive sanitization
    const isReplyOrForward = options?.preserveReplyFormat === true;

    let sanitizedContent: string = sanitizeRequested
      ? sanitizeHtml(htmlContent, { preserveReplyFormat: isReplyOrForward })
      : htmlContent;

    if (sanitizeRequested) {
      sanitizedSnapshot = sanitizedContent;
    }

    // Log content changes from sanitization
    console.log('HTML sanitization results:', {
      originalLength: originalContent.length,
      sanitizedLength: sanitizedContent.length,
      difference: originalContent.length - sanitizedContent.length,
      percentRemoved: ((originalContent.length - sanitizedContent.length) / originalContent.length * 100).toFixed(2) + '%',
      isEmpty: !sanitizedContent || sanitizedContent.trim().length === 0,
      isReplyOrForward: isReplyOrForward
    });

    // Detect if content is a forwarded message to ensure special handling for tables
    const isForwardedEmail =
      sanitizedContent.includes('---------- Forwarded message ----------') ||
      sanitizedContent.includes('Forwarded message') ||
      (sanitizedContent.includes('From:') && sanitizedContent.includes('Date:') &&
       sanitizedContent.includes('Subject:') && sanitizedContent.includes('To:'));

    // Special processing for forwarded email styling
    if (isForwardedEmail || isReplyOrForward) {
      console.log('Detected forwarded email or reply content, enhancing structure');
      // Style header tables, their cells and quoted blocks
      sanitizedContent = addDefaultStyle(sanitizedContent, 'table', TABLE_STYLE);
      sanitizedContent = addDefaultStyle(sanitizedContent, 'td', CELL_STYLE);
      sanitizedContent = addDefaultStyle(sanitizedContent, 'blockquote', BLOCKQUOTE_STYLE);
    }

    // Fix common email client quirks without breaking cid: URLs
    sanitizedContent = sanitizedContent
      // Fix for Outlook WebVML content
      .replace(/<!--\[if\s+gte\s+mso/g, '<!--[if gte mso')
      // Fix for broken image paths starting with // (add https:)
      .replace(/src="\/\//g, 'src="https://')
      // Handle mixed content issues by converting http:// to https://
      .replace(/src="http:\/\//g, 'src="https://')
      // Fix email signature line breaks
      .replace(/--<br>/g, '<hr style="border-top: 1px solid #ccc; margin: 15px 0;">')
      .replace(/-- <br>/g, '<hr style="border-top: 1px solid #ccc; margin: 15px 0;">')
      // Remove excessive whitespace from the HTML string itself
      .replace(/>\s+</g, '> <');

    // Additional processing for quoted content in replies/forwards
    if (sanitizedContent.includes('blockquote')) {
      console.log('Enhancing blockquote styling');
      sanitizedContent = addDefaultStyle(sanitizedContent, 'blockquote', BLOCKQUOTE_STYLE);
    }

    return {
      sanitizedContent,
      hasImages: sanitizedContent.includes('<img'),
      hasExternalContent: sanitizedContent.includes('https://'),
      direction: detectTextDirection(sanitizedContent)
    };
  } catch (error) {
    console.error('Error processing HTML content:', error);

    // Never return raw markup as "sanitized" when sanitization was requested:
    // prefer the sanitizer output if it completed, else show the input as escaped text.
    const fallbackContent = !sanitizeRequested
      ? htmlContent
      : (sanitizedSnapshot ?? escapeHtml(htmlContent));

    return {
      sanitizedContent: fallbackContent,
      hasImages: false,
      hasExternalContent: false,
      direction: 'ltr',
    };
  }
}

/**
 * Render a plain-text body as HTML.
 *
 * The text is HTML-escaped, newlines become `<br>`, runs of two or more line
 * breaks become paragraph boundaries, and the result is wrapped in a styled
 * `email-content` container whose `dir` attribute comes from
 * `detectTextDirection`.
 *
 * @param text - Plain-text body; a falsy value yields `''`.
 * @returns The HTML representation of the text.
 */
export function formatPlainTextToHtml(text: string): string {
  if (!text) {
    return '';
  }

  // Direction is detected on the raw text, before any markup is introduced.
  const direction = detectTextDirection(text);

  // Escaping rules, applied in order ('&' must come first) to prevent XSS.
  const escapeRules: Array<[RegExp, string]> = [
    [/&/g, '&amp;'],
    [/</g, '&lt;'],
    [/>/g, '&gt;'],
    [/"/g, '&quot;'],
    [/'/g, '&#039;'],
  ];

  // Layout rules, applied in order after escaping.
  const layoutRules: Array<[RegExp, string]> = [
    [/\r\n|\r|\n/g, '<br>'],            // every newline becomes <br>
    [/((?:<br>){2,})/g, '</p><p>'],     // blank lines separate paragraphs
    [/<br><\/p>/g, '</p>'],             // drop a <br> right before </p>
    [/<p><br>/g, '<p>'],                // drop a <br> right after <p>
  ];

  const applyRules = (input: string, rules: Array<[RegExp, string]>): string =>
    rules.reduce((acc, [pattern, replacement]) => acc.replace(pattern, replacement), input);

  const escapedText = applyRules(text, escapeRules);
  const formattedText = applyRules(escapedText, layoutRules);

  return `<div class="email-content" style="font-family: -apple-system, BlinkMacSystemFont, Menlo, Monaco, Consolas, 'Courier New', monospace; white-space: pre-wrap; line-height: 1.5; color: #333; padding: 15px; max-width: 100%; overflow-wrap: break-word;" dir="${direction}"><p>${formattedText}</p></div>`;
}