import { ProcessLogger } from '../process-logger';

/**
 * A contiguous slice of a source text, as produced by `TextChunker.chunkText`.
 */
export interface TextChunk {
  /** The slice itself: the source text from `startIndex` up to, but not including, `endIndex`. */
  content: string;
  /** Offset in the source text at which this chunk begins (inclusive). */
  startIndex: number;
  /** Offset in the source text at which this chunk ends (exclusive). */
  endIndex: number;
}

/**
 * Helpers for preparing raw document text for downstream processing:
 * building a short excerpt for document-type classification, splitting text
 * into size-bounded chunks, and pulling out a few lease-related sections by
 * pattern matching.
 */
export class TextChunker {
  /**
   * One case-insensitive pattern per key phrase. Each pattern matches the
   * whole line (everything between newlines) that contains the phrase.
   */
  private static readonly keyTermPatterns: readonly RegExp[] = [
    'LEASE AGREEMENT',
    'AMENDMENT',
    'LEASE MODIFICATION',
    'EXTENSION AGREEMENT',
    'SUPPLEMENTAL AGREEMENT'
  ].map(term => new RegExp(`[^\\n]*${term}[^\\n]*`, 'i'));

  /**
   * Section name -> pattern. Every pattern is case-insensitive and stops at
   * the first blank line (or end of text) after its trigger phrase.
   */
  private static readonly sectionPatterns: Readonly<Record<string, RegExp>> = {
    // The original unanchored form `(?:WHEREAS[^]*?|[^]*?)(?:the )?premises...`
    // always matched from offset 0 when "premises" was present (its lazy
    // match-anything prefix succeeds at the first start position tried), and
    // rescanned the rest of the text from every offset when it was absent.
    // Anchoring with `^` yields the same match without the quadratic scan.
    premises: /^[\s\S]*?premises[\s\S]*?(?=\n\s*\n|$)/i,
    parties: /(?:THIS AGREEMENT[\s\S]*?BETWEEN[\s\S]*?|PARTIES[\s\S]*?)(?=\n\s*\n|$)/i,
    term: /(?:TERM OF (?:THE )?LEASE|LEASE TERM)[\s\S]*?(?=\n\s*\n|$)/i,
    rent: /(?:BASE RENT|MINIMUM RENT|RENTAL PAYMENTS)[\s\S]*?(?=\n\s*\n|$)/i
  };

  // Created in the constructor; none of the methods in this class read it.
  private processLogger: ProcessLogger;

  /**
   * @param userId - Passed straight to the `ProcessLogger` constructor.
   */
  constructor(userId: string) {
    this.processLogger = new ProcessLogger(userId);
  }

  /**
   * Build a condensed excerpt of the parts of a document most likely to
   * reveal its type.
   *
   * The excerpt is the newline-joined concatenation of, in order:
   *  1. the header: the longest prefix of at most 500 characters that is
   *     immediately followed by a blank line (omitted if there is none);
   *  2. for each key phrase, the first line within the first 20% of the text
   *     that contains it (case-insensitive);
   *  3. the first 2000 characters of the text.
   *
   * Parts may overlap; no de-duplication is performed.
   *
   * @param text - Full document text.
   * @returns The excerpt, truncated to at most 4000 characters.
   */
  extractClassificationText(text: string): string {
    // Leading slice of the document (usually contains the document type).
    const firstPart = text.substring(0, 2000);

    // Potential title/header block: everything before the first blank line.
    const headerMatch = text.match(/^[\s\S]{0,500}(?=\n\s*\n)/);
    const headerText = headerMatch ? headerMatch[0] : '';

    // Key phrases are only searched for in the first 20% of the document.
    const searchArea = text.substring(0, Math.floor(text.length * 0.2));
    const keyTermMatches = TextChunker.keyTermPatterns
      .map(pattern => {
        const match = searchArea.match(pattern);
        return match ? match[0] : '';
      })
      .filter(Boolean);

    return [headerText, ...keyTermMatches, firstPart]
      .filter(Boolean)
      .join('\n')
      .substring(0, 4000); // Keep within reasonable token limit
  }

  /**
   * Split text into consecutive chunks of at most `maxChunkSize` characters.
   *
   * Chunks are contiguous and cover the whole input, so concatenating their
   * `content` reproduces `text`. When a chunk has to end before the end of
   * the text, the cut is placed, in order of preference:
   *  1. just before the last blank line (`"\n\n"`) inside the window, so the
   *     following chunk begins with that blank line;
   *  2. just after the last `". "` period inside the window;
   *  3. at the hard size limit.
   *
   * @param text - Text to split. An empty string yields an empty array.
   * @param maxChunkSize - Upper bound on chunk length in characters.
   *   Fractional values are rounded down; `Infinity` yields a single chunk.
   * @returns The chunks in document order.
   * @throws RangeError if `maxChunkSize` is `NaN` or less than 1. Such values
   *   would otherwise make the loop spin forever without consuming input.
   */
  chunkText(text: string, maxChunkSize: number = 4000): TextChunk[] {
    if (Number.isNaN(maxChunkSize) || maxChunkSize < 1) {
      throw new RangeError(
        `maxChunkSize must be a number >= 1, received ${maxChunkSize}`
      );
    }
    const limit = Math.floor(maxChunkSize);

    const chunks: TextChunk[] = [];
    let currentIndex = 0;

    while (currentIndex < text.length) {
      let endIndex = Math.min(currentIndex + limit, text.length);

      if (endIndex < text.length) {
        // Prefer a paragraph boundary. A break at currentIndex itself is
        // rejected because cutting there would produce an empty chunk.
        const paragraphBreak = text.lastIndexOf('\n\n', endIndex);
        if (paragraphBreak > currentIndex) {
          endIndex = paragraphBreak;
        } else {
          // Otherwise prefer a sentence boundary. Searching from endIndex - 1
          // keeps the period inside the window, so the chunk, which includes
          // the period, never exceeds the limit.
          const sentenceBreak = text.lastIndexOf('. ', endIndex - 1);
          if (sentenceBreak > currentIndex) {
            endIndex = sentenceBreak + 1;
          }
        }
      }

      chunks.push({
        content: text.substring(currentIndex, endIndex),
        startIndex: currentIndex,
        endIndex
      });

      currentIndex = endIndex;
    }

    return chunks;
  }

  /**
   * Extract a few common lease sections from document text by pattern.
   *
   * Recognised keys are `premises`, `parties`, `term` and `rent`; a key is
   * present in the result only if its pattern matched. Each value runs up to
   * the first blank line (or end of text) after the trigger phrase and is
   * truncated to 1000 characters.
   *
   * NOTE(review): the `premises` value starts at the beginning of the
   * document, not at the word "premises", so when the first mention lies
   * beyond the first 1000 characters the value is simply the document's
   * first 1000 characters. This matches the previous behaviour; confirm it
   * is what consumers expect.
   *
   * @param text - Full document text.
   * @returns Map of section name to extracted text.
   */
  extractKeySections(text: string): Record<string, string> {
    const sections: Record<string, string> = {};

    for (const [key, pattern] of Object.entries(TextChunker.sectionPatterns)) {
      const match = text.match(pattern);
      if (match) {
        sections[key] = match[0].substring(0, 1000); // Limit section size
      }
    }

    return sections;
  }
}
