Neah/lib/utils/email-content.ts
2025-05-01 17:01:26 +02:00

386 lines
14 KiB
TypeScript

/**
* Centralized Email Content Utilities
*
* This file contains all core functions for email content processing:
* - Content extraction
* - HTML sanitization
* - Text direction handling
* - URL fixing
*
* Other modules should import from this file rather than implementing their own versions.
*/
import { sanitizeHtml } from './dom-purify-config';
import { detectTextDirection } from './text-direction';
import { EmailContent } from '@/types/email';
import { processCidReferences } from './email-utils';
/**
 * Extract content from various possible email formats.
 * Centralized implementation to reduce duplication across the codebase.
 *
 * Lookup order:
 *  1. `email` itself when it is a raw string.
 *  2. `email.content` as an object: `text` / `html`, then `body` (string or
 *     object), then a string `data`.
 *  3. `email.content` as a string.
 *  4. Top-level `html` / `text`, then `body.html` / `bodyHtml` / `htmlBody`
 *     and `body.text` / `bodyText` / `plainText`, and finally a string `body`.
 *
 * Raw strings are classified as HTML or plain text with `isHtmlContent`.
 * When only HTML is found, `text` is derived from it with `extractTextFromHtml`.
 *
 * @param email - Email-like value of unknown shape (object or raw string).
 * @returns The extracted text and HTML, whether the content is HTML, and the
 *   text direction. `direction` is always `'ltr'` or `'rtl'` and `isHtml` is
 *   always a real boolean, whatever the input carried.
 */
export function extractEmailContent(email: any): { text: string; html: string; isHtml: boolean; direction: 'ltr' | 'rtl'; } {
  let textContent = '';
  let htmlContent = '';
  let isHtml = false;
  let direction: 'ltr' | 'rtl' = 'ltr';

  // Early exit if no email
  if (!email) {
    console.log('extractEmailContent: No email provided');
    return { text: '', html: '', isHtml: false, direction: 'ltr' };
  }

  // Anything other than an explicit "rtl" (e.g. "auto", undefined) falls back
  // to "ltr" so the declared return type is honoured.
  const toDirection = (value: unknown): 'ltr' | 'rtl' =>
    typeof value === 'string' && value.toLowerCase() === 'rtl' ? 'rtl' : 'ltr';

  // Store an untyped string as either HTML or plain text.
  const useRawString = (raw: string): void => {
    if (isHtmlContent(raw)) {
      htmlContent = raw;
      isHtml = true;
    } else {
      textContent = raw;
      isHtml = false;
    }
  };

  try {
    if (typeof email === 'string') {
      // A bare string used to fall through every branch and yield no content.
      useRawString(email);
    } else if (email.content && typeof email.content === 'object') {
      // Standard format with content object
      const content = email.content;
      textContent = content.text || '';
      htmlContent = content.html || '';
      isHtml = Boolean(content.isHtml || htmlContent);
      direction = toDirection(content.direction);

      // Handle complex email formats where content might be nested
      if (!textContent && !htmlContent) {
        const body = content.body;
        if (body) {
          if (typeof body === 'string') {
            useRawString(body);
          } else if (typeof body === 'object') {
            // Some email formats nest content inside body
            htmlContent = body.html || '';
            textContent = body.text || '';
            isHtml = Boolean(body.isHtml || htmlContent);
            direction = toDirection(body.direction);
          }
        }

        // Check for data property which some email services use
        if (!textContent && !htmlContent && typeof content.data === 'string' && content.data) {
          useRawString(content.data);
        }
      }
    } else if (typeof email.content === 'string') {
      useRawString(email.content);
    } else {
      // Check other common properties
      htmlContent = email.html || '';
      textContent = email.text || '';
      isHtml = Boolean(email.isHtml || htmlContent);
      direction = toDirection(email.direction);

      // If still no content, check for less common properties
      if (!htmlContent && !textContent) {
        htmlContent = email.body?.html || email.bodyHtml || email.htmlBody || '';
        textContent = email.body?.text || email.bodyText || email.plainText || '';
        isHtml = Boolean(email.body?.isHtml || htmlContent);
        direction = toDirection(email.body?.direction);

        // Last resort: a top-level body that is itself the raw content string.
        if (!htmlContent && !textContent && typeof email.body === 'string' && email.body) {
          useRawString(email.body);
        }
      }
    }
  } catch (error) {
    // Best effort: keep whatever was extracted before the failure.
    console.error('Error extracting email content:', error);
  }

  // Ensure we always have at least some text content
  if (!textContent && htmlContent) {
    textContent = extractTextFromHtml(htmlContent);
  }

  // Log extraction results (lengths only, never the content itself)
  console.log('Extracted email content:', {
    hasHtml: !!htmlContent,
    htmlLength: htmlContent?.length || 0,
    hasText: !!textContent,
    textLength: textContent?.length || 0,
    isHtml,
    direction
  });

  return { text: textContent, html: htmlContent, isHtml, direction };
}
/**
 * Extract plain text from HTML content.
 *
 * In a browser the markup is parsed with `DOMParser`, which builds an inert
 * document, so untrusted email HTML does not run scripts or inline handlers
 * while it is being read. Assigning it to `innerHTML` of an element created
 * from the live document gives no such guarantee.
 *
 * Outside a browser a regex fallback strips tags and decodes the most common
 * entities, then collapses whitespace.
 *
 * In both paths the contents of `<script>` and `<style>` elements are dropped
 * because they are not readable text.
 *
 * @param html - HTML markup; may be empty.
 * @returns The text content, or `''` for empty input.
 */
export function extractTextFromHtml(html: string): string {
  if (!html) return '';
  try {
    // Use DOM API if available
    if (
      typeof window !== 'undefined' &&
      typeof document !== 'undefined' &&
      typeof DOMParser !== 'undefined'
    ) {
      const parsed = new DOMParser().parseFromString(html, 'text/html');
      parsed.querySelectorAll('script, style').forEach((node) => node.remove());
      return parsed.body?.textContent ?? '';
    }

    // Simple regex fallback for non-browser environments
    return html
      // Drop script/style blocks together with their contents
      .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, ' ')
      .replace(/<[^>]*>/g, ' ')
      .replace(/&nbsp;/g, ' ')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#0*39;|&apos;/g, "'")
      // Decode &amp; last so "&amp;lt;" becomes the literal text "&lt;", not "<"
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ')
      .trim();
  } catch (e) {
    console.error('Error extracting text from HTML:', e);
    // Fallback to basic strip
    return html.replace(/<[^>]*>/g, ' ').trim();
  }
}
/**
 * Matches an opening tag (or doctype) that indicates HTML markup.
 * `html`, `body` and `div` match as bare prefixes. Every other tag name must
 * be followed by whitespace, `/` or `>`, so text such as `<a@example.com>` is
 * not mistaken for an `<a>` tag.
 */
const LIKELY_HTML_TAG_PATTERN =
  /<(?:html|body|div|!doctype\s+html|(?:p|br|hr|span|table|tbody|tr|td|th|a|img|ul|ol|li|h[1-6]|blockquote|pre|font|center|head|meta|style|b|i|u|strong|em)(?=[\s/>]))/i;

/**
 * Check if a string is likely HTML content.
 *
 * The string must start with `<` after leading whitespace and contain at
 * least one well-known HTML tag. Matching is case-insensitive and accepts tags
 * with attributes or self-closing syntax (`<p style="…">`, `<br/>`, `<BR>`),
 * which would otherwise be classified as plain text.
 *
 * @param content - Candidate string.
 * @returns `true` when the string looks like HTML markup, otherwise `false`.
 */
export function isHtmlContent(content: string): boolean {
  if (!content || typeof content !== 'string') return false;
  if (!content.trimStart().startsWith('<')) return false;
  return LIKELY_HTML_TAG_PATTERN.test(content);
}
/**
 * Format and standardize email content for display following email industry standards.
 * This is the main entry point for rendering email content.
 *
 * - HTML content is passed through `processHtmlContent` with sanitization on
 *   and wrapped in a styled `email-content` container.
 * - Text-only content is rendered through `formatPlainTextToHtml`.
 * - If neither is present a "No content available" placeholder is returned.
 * - Any error is reported in an `email-content-error` box. The error message
 *   is HTML-escaped first, because it may echo parts of the untrusted email.
 *
 * @param email - Email-like value accepted by `extractEmailContent`.
 * @returns An HTML string, or `''` when no email is provided.
 */
export function formatEmailContent(email: any): string {
  if (!email) {
    console.log('formatEmailContent: No email provided');
    return '';
  }

  try {
    // Extract content from email
    const { text, html, isHtml, direction } = extractEmailContent(email);
    console.log('formatEmailContent processing:', {
      hasHtml: !!html,
      htmlLength: html?.length || 0,
      hasText: !!text,
      textLength: text?.length || 0,
      emailType: typeof email === 'string' ? 'string' : 'object',
      isHtml,
      direction
    });

    // If we have HTML content, sanitize and standardize it
    if (html) {
      const processed = processHtmlContent(html, { sanitize: true });
      console.log('HTML content processed:', {
        processedLength: processed.sanitizedContent?.length || 0,
        isEmpty: !processed.sanitizedContent || processed.sanitizedContent.trim().length === 0
      });
      // Apply styling
      return `<div class="email-content" style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif; line-height: 1.6; color: #333; max-width: 100%; overflow-x: auto; overflow-wrap: break-word; word-wrap: break-word;" dir="${processed.direction}">${processed.sanitizedContent}</div>`;
    }

    // If we only have text content, format it properly
    if (text) {
      console.log('Using plain text formatting');
      return formatPlainTextToHtml(text);
    }

    // Default case: empty or unrecognized content
    return '<div class="email-content-empty" style="padding: 20px; text-align: center; color: #666;">No content available</div>';
  } catch (error) {
    console.error('formatEmailContent: Error formatting email content:', error);
    const rawMessage = error instanceof Error ? error.message : 'Unknown error';
    // Escape before interpolating: the message is not trusted markup.
    const safeMessage = rawMessage
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#039;');
    return `<div class="email-content-error" style="padding: 15px; color: #721c24; background-color: #f8d7da; border: 1px solid #f5c6cb; border-radius: 4px;"><p>Error displaying email content</p><p style="font-size: 12px; margin-top: 10px;">${safeMessage}</p></div>`;
  }
}
/**
 * Process HTML content to ensure safe rendering and proper formatting.
 *
 * Steps, in order:
 *  1. Resolve `cid:` references via `processCidReferences` when attachments are given.
 *  2. Sanitize with `sanitizeHtml` unless `options.sanitize === false`.
 *  3. For forwarded/reply content, add default inline styles to `<table>` and
 *     `<td>` tags that do not already carry a `style` attribute.
 *  4. Apply fixes for common email-client quirks (protocol-relative and
 *     `http://` `src` URLs, signature separators, inter-tag whitespace).
 *  5. Add default inline styles to unstyled `<blockquote>` tags.
 *
 * On error the function fails closed: it returns the sanitizer's output if
 * sanitization had already completed, otherwise the HTML-escaped input. The
 * raw input is returned only when the caller explicitly disabled sanitization.
 *
 * @param htmlContent - HTML markup to process.
 * @param options - Processing options. A string is accepted for backwards
 *   compatibility (legacy `textContent` parameter) and is treated as
 *   `{ sanitize: true }`. `blockExternalContent` is only logged by this function.
 * @returns The processed markup plus flags describing it. `hasExternalContent`
 *   is true when the result contains the substring `https://`.
 */
export function processHtmlContent(
  htmlContent: string,
  options?: {
    sanitize?: boolean;
    blockExternalContent?: boolean;
    preserveReplyFormat?: boolean;
    attachments?: Array<{
      filename?: string;
      name?: string;
      contentType?: string;
      content?: string;
      contentId?: string;
    }>;
  } | string // Support for legacy textContent parameter
): {
  sanitizedContent: string;
  hasImages: boolean;
  hasExternalContent: boolean;
  direction: 'ltr' | 'rtl';
} {
  // Handle legacy string parameter (textContent)
  if (typeof options === 'string') {
    options = { sanitize: true };
  }

  console.log('Processing HTML content:', {
    contentLength: htmlContent?.length || 0,
    startsWithHtml: htmlContent?.startsWith('<html'),
    startsWithDiv: htmlContent?.startsWith('<div'),
    containsForwardedMessage: htmlContent?.includes('---------- Forwarded message ----------'),
    containsQuoteHeader: htmlContent?.includes('<div class="gmail_quote"'),
    sanitize: options?.sanitize,
    preserveReplyFormat: options?.preserveReplyFormat,
    blockExternalContent: options?.blockExternalContent,
    hasAttachments: options?.attachments?.length || 0
  });

  if (!htmlContent) {
    return {
      sanitizedContent: '',
      hasImages: false,
      hasExternalContent: false,
      direction: 'ltr',
    };
  }

  // Store the original content for comparison
  const originalContent = htmlContent;

  // Process CID references before sanitization
  if (options?.attachments?.length) {
    console.log('Processing CID references in processHtmlContent');
    htmlContent = processCidReferences(htmlContent, options.attachments);
  }

  // Apply sanitization by default unless explicitly turned off
  const sanitizeRequested = options?.sanitize !== false;
  // Set once sanitization has completed, so the error path can still return
  // safe markup if a later formatting step throws.
  let safeContent: string | null = null;

  try {
    // Special handling for reply/forwarded content with less aggressive sanitization
    const isReplyOrForward = options?.preserveReplyFormat === true;

    let sanitizedContent = sanitizeRequested
      ? sanitizeHtml(htmlContent, { preserveReplyFormat: isReplyOrForward })
      : htmlContent;
    safeContent = sanitizedContent;

    // Log content changes from sanitization
    console.log('HTML sanitization results:', {
      originalLength: originalContent.length,
      sanitizedLength: sanitizedContent.length,
      difference: originalContent.length - sanitizedContent.length,
      percentRemoved: ((originalContent.length - sanitizedContent.length) / originalContent.length * 100).toFixed(2) + '%',
      isEmpty: !sanitizedContent || sanitizedContent.trim().length === 0,
      isReplyOrForward: isReplyOrForward
    });

    // Detect if content is a forwarded message to ensure special handling for tables
    const isForwardedEmail =
      sanitizedContent.includes('---------- Forwarded message ----------') ||
      sanitizedContent.includes('Forwarded message') ||
      (sanitizedContent.includes('From:') && sanitizedContent.includes('Date:') &&
        sanitizedContent.includes('Subject:') && sanitizedContent.includes('To:'));

    // Special processing for forwarded email styling
    if (isForwardedEmail || isReplyOrForward) {
      console.log('Detected forwarded email or reply content, enhancing structure');
      // Default styling for header tables; tags that already have a style
      // attribute are left alone instead of receiving a second one.
      sanitizedContent = addDefaultInlineStyle(
        sanitizedContent,
        'table',
        'margin: 10px 0; border-collapse: collapse; font-size: 13px; color: #333;'
      );
      sanitizedContent = addDefaultInlineStyle(
        sanitizedContent,
        'td',
        'padding: 3px 5px; vertical-align: top;'
      );
    }

    // Fix common email client quirks without breaking cid: URLs
    sanitizedContent = sanitizedContent
      // Fix for Outlook WebVML content
      .replace(/<!--\[if\s+gte\s+mso/g, '<!--[if gte mso')
      // Fix for broken image paths starting with // (add https:)
      .replace(/src="\/\//g, 'src="https://')
      // Handle mixed content issues by converting http:// to https://
      .replace(/src="http:\/\//g, 'src="https://')
      // Fix email signature line breaks
      .replace(/--<br>/g, '<hr style="border-top: 1px solid #ccc; margin: 15px 0;">')
      .replace(/-- <br>/g, '<hr style="border-top: 1px solid #ccc; margin: 15px 0;">')
      // Remove excessive whitespace from the HTML string itself
      .replace(/>\s+</g, '> <');

    // Additional processing for quoted content in replies/forwards
    if (sanitizedContent.includes('blockquote')) {
      console.log('Enhancing blockquote styling');
      sanitizedContent = addDefaultInlineStyle(
        sanitizedContent,
        'blockquote',
        'margin: 0; padding-left: 10px; border-left: 3px solid #ddd; color: #505050; background-color: #f9f9f9; padding: 10px;'
      );
    }

    return {
      sanitizedContent,
      hasImages: /<img/i.test(sanitizedContent),
      hasExternalContent: sanitizedContent.includes('https://'),
      direction: detectTextDirection(sanitizedContent)
    };
  } catch (error) {
    console.error('Error processing HTML content:', error);
    // Fail closed: never return unsanitized markup as "sanitizedContent"
    // unless the caller turned sanitization off.
    const fallbackContent =
      safeContent ?? (sanitizeRequested ? escapeHtmlForFallback(htmlContent) : htmlContent);
    return {
      sanitizedContent: fallbackContent,
      hasImages: false,
      hasExternalContent: false,
      direction: 'ltr',
    };
  }
}

/** Append `style` to every `<tagName …>` opening tag that has no `style=` yet. */
function addDefaultInlineStyle(html: string, tagName: string, style: string): string {
  const openingTag = new RegExp(`<${tagName}([^>]*)>`, 'g');
  return html.replace(openingTag, (match: string, attrs: string) =>
    match.includes('style=') ? match : `<${tagName}${attrs} style="${style}">`
  );
}

/** Escape HTML metacharacters so the markup renders as inert text. */
function escapeHtmlForFallback(value: string): string {
  return value
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#039;');
}
/**
 * Render plain text as an HTML fragment.
 *
 * The text is HTML-escaped, newlines become `<br>`, runs of two or more line
 * breaks become paragraph boundaries, and the result is wrapped in a styled
 * `email-content` container whose `dir` attribute comes from
 * `detectTextDirection`.
 *
 * @param text - Plain text to render.
 * @returns The HTML fragment, or `''` for empty input.
 */
export function formatPlainTextToHtml(text: string): string {
  if (!text) {
    return '';
  }

  // Direction is detected on the raw text, before any escaping
  const direction = detectTextDirection(text);

  // Applies each [pattern, replacement] pair in order
  const applySteps = (input: string, steps: Array<[RegExp, string]>): string =>
    steps.reduce((current, [pattern, replacement]) => current.replace(pattern, replacement), input);

  // Escape HTML characters to prevent XSS ("&" must be handled first)
  const escapedText = applySteps(text, [
    [/&/g, '&amp;'],
    [/</g, '&lt;'],
    [/>/g, '&gt;'],
    [/"/g, '&quot;'],
    [/'/g, '&#039;'],
  ]);

  // Turn line breaks into <br> and blank lines into paragraph boundaries
  const formattedText = applySteps(escapedText, [
    [/\r\n|\r|\n/g, '<br>'],           // every newline becomes <br>
    [/((?:<br>){2,})/g, '</p><p>'],    // two or more in a row split paragraphs
    [/<br><\/p>/g, '</p>'],            // drop a <br> right before </p>
    [/<p><br>/g, '<p>'],               // drop a <br> right after <p>
  ]);

  const wrapperStyle =
    "font-family: -apple-system, BlinkMacSystemFont, Menlo, Monaco, Consolas, 'Courier New', monospace; white-space: pre-wrap; line-height: 1.5; color: #333; padding: 15px; max-width: 100%; overflow-wrap: break-word;";

  return `<div class="email-content" style="${wrapperStyle}" dir="${direction}"><p>${formattedText}</p></div>`;
}