import React, { useState, useRef, useEffect } from 'react';
import Tesseract from 'tesseract.js';
import { Document, Page, pdfjs } from 'react-pdf';
import { IonButton, IonSpinner } from '@ionic/react';
// Point pdf.js at a worker script whose version matches the bundled pdf.js (`pdfjs.version`).
// The scheme is spelled out on purpose: a protocol-relative URL ("//host/...") inherits the
// scheme of the hosting page, so it cannot be fetched when the app is served from a
// non-HTTP origin (e.g. file:// or a native WebView scheme).
pdfjs.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.js`;

/** Props accepted by the `PDFExtractor` component. */
interface PDFExtractorProps {
  /** The PDF to process: either a string handed straight to pdf.js / react-pdf, or a `File` that is read into memory first. */
  file: string | File;
  /** Invoked once processing finishes, with the serialized (JSON string) per-page extraction result. */
  callback: (data: string) => void;
  /**
   * Optional case-insensitive filter: when set, only pages whose serialized content contains
   * this phrase are passed to `callback` (e.g. to pick the relevant page(s) out of a
   * many-page Morningstar doc).
   */
  keyPhrase?: string;
}

// Per-page threshold for deciding whether the PDF is mostly scanned images: the amount of
// text found in the PDF's text layer is compared against this value multiplied by the page
// count, and OCR is used when the text layer falls short.
const THRESHOLD_TEXT_LENGTH = 100;

/**
 * Button-driven PDF text extractor.
 *
 * Renders the PDF into hidden canvases (via react-pdf) and, when the user presses
 * "Start Processing":
 *   1. reads the file into memory if it was supplied as a `File`,
 *   2. extracts the embedded text layer with pdf.js,
 *   3. falls back to Tesseract OCR on the rendered canvases when the text layer is too
 *      sparse (fewer than `THRESHOLD_TEXT_LENGTH` characters per page on average),
 *   4. serializes the per-page result, optionally keeps only pages containing `keyPhrase`,
 *      and hands the JSON string to `callback`.
 *
 * @param file      PDF source: a string passed to pdf.js / react-pdf, or a `File`.
 * @param callback  Receives the serialized extraction result (a JSON array of page strings).
 * @param keyPhrase Optional case-insensitive phrase used to filter the serialized pages.
 */
const PDFExtractor: React.FC<PDFExtractorProps> = ({ file, callback, keyPhrase }) => {
  // Page count reported by react-pdf; it only drives how many hidden <Page> canvases are rendered.
  const [numPages, setNumPages] = useState<number>(0);
  const [extractedTextData, setExtractedTextData] = useState<Map<number, unknown>>(new Map());
  const [isProcessing, setIsProcessing] = useState<boolean>(false);
  const [error, setError] = useState<string | null>(null);
  const containerRef = useRef<HTMLDivElement>(null); // Wraps the hidden canvases used for OCR

  /**
   * Reads a `File` fully into memory.
   *
   * @returns A promise resolving to the file contents; rejects with
   *          `Error('File reading failed.')` if the read fails or yields no binary result.
   */
  const handleFileRead = (file: File): Promise<ArrayBuffer> => {
    return new Promise((resolve, reject) => {
      const reader = new FileReader();
      reader.onload = () => {
        // readAsArrayBuffer yields an ArrayBuffer on success; anything else is a failed read.
        if (reader.result instanceof ArrayBuffer) {
          resolve(reader.result);
        } else {
          reject(new Error('File reading failed.'));
        }
      };
      reader.onerror = () => reject(new Error('File reading failed.'));
      reader.readAsArrayBuffer(file);
    });
  };

  /** The JSON-serializable subset of a pdf.js text item that is kept per page. */
  interface TextItem {
    str: string;
    transform: number[];
  }

  /**
   * Extracts the embedded text layer of every page with pdf.js.
   *
   * @returns `textData` (1-based page number -> list of text items), `totalTextLength`
   *          (total number of characters across all pages) and `numPages` (page count of
   *          the parsed document).
   */
  const extractTextFromPDF = async (
    pdf: ArrayBuffer | string
  ): Promise<{ textData: Map<number, unknown>; totalTextLength: number; numPages: number }> => {
    const pdfDoc = await pdfjs.getDocument(pdf).promise;
    try {
      const pageCount = pdfDoc.numPages;
      const textData = new Map<number, unknown>();
      let totalTextLength = 0;

      for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
        const page = await pdfDoc.getPage(pageNumber);
        const textContent = await page.getTextContent();

        // pdf.js item objects are reduced to the plain fields we need so they serialize cleanly.
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- pdf.js item union is not narrowed here
        const serializableTextItems = textContent.items.map((item: any): TextItem => {
          // Count characters, not items: the OCR decision is about how much text exists.
          totalTextLength += typeof item.str === 'string' ? item.str.length : 0;
          return { str: item.str, transform: item.transform };
        });

        textData.set(pageNumber, serializableTextItems);
      }

      return { textData, totalTextLength, numPages: pageCount };
    } finally {
      // Release the parsed document and its worker-side resources; this instance is
      // separate from the one react-pdf loads for rendering.
      await pdfDoc.destroy();
    }
  };

  /**
   * Runs Tesseract OCR over the hidden canvases rendered by react-pdf.
   *
   * Pages whose canvas is not present in the DOM are skipped, so the returned map may
   * contain fewer than `pageCount` entries (or none at all).
   *
   * @param pageCount Number of pages to look for (1-based, inclusive).
   * @returns 1-based page number -> recognized text.
   */
  const performOCR = async (pageCount: number): Promise<Map<number, string>> => {
    const ocrText = new Map<number, string>();
    // Capture the container once so a mid-loop unmount cannot turn into a null dereference.
    const container = containerRef.current;
    if (!container) {
      return ocrText;
    }

    for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
      const canvas = container.querySelector<HTMLCanvasElement>(`.pdf-page-${pageNumber} canvas`);
      if (!canvas) {
        continue;
      }
      const { data } = await Tesseract.recognize(canvas.toDataURL(), 'eng');
      ocrText.set(pageNumber, data.text);
    }
    return ocrText;
  };

  /**
   * Click handler: runs the whole extraction pipeline and reports the result through
   * `callback`. Any failure is logged and surfaced to the user as a generic error message.
   */
  const handleProcessClick = async () => {
    try {
      setIsProcessing(true);
      setError(null);

      // Step 1: Read the file if necessary
      const pdfData: string | ArrayBuffer = typeof file === 'string' ? file : await handleFileRead(file);

      // Step 2: Extract text using pdfjs
      const { textData, totalTextLength, numPages: pageCount } = await extractTextFromPDF(pdfData);

      // Step 3: Perform OCR if the text layer is too sparse. The page count comes from the
      // document that was just parsed: the `numPages` state is only set once react-pdf has
      // loaded and may still be 0 here, which would disable the OCR fallback entirely.
      let result: Map<number, unknown> = textData;
      if (totalTextLength < THRESHOLD_TEXT_LENGTH * pageCount) {
        const ocrData = await performOCR(pageCount);
        // Keep the (sparse) text layer if no canvas was available to recognize.
        if (ocrData.size > 0) {
          result = ocrData;
        }
      }
      setExtractedTextData(result);

      // Step 4: Serialize page by page (ascending page order) and invoke the callback.
      const sortedPageNumbers = Array.from(result.keys()).sort((a, b) => a - b);
      let pageData = sortedPageNumbers.map(
        (pageIndex) => `Page ${pageIndex}:\n\n` + JSON.stringify(result.get(pageIndex))
      );
      if (keyPhrase) {
        // NOTE: the match runs against the serialized page, so for text-layer results a
        // phrase that spans several text items will not be found.
        const needle = keyPhrase.toLowerCase();
        pageData = pageData.filter((pageString) => pageString.toLowerCase().includes(needle));
      }
      callback(JSON.stringify(pageData));
    } catch (err: unknown) {
      // Keep the user-facing message generic, but do not lose the underlying cause.
      console.error('PDF extraction failed:', err);
      setError('An error occurred during processing.');
    } finally {
      setIsProcessing(false);
    }
  };

  /** react-pdf load callback: records the page count so every page gets a hidden canvas. */
  const onDocumentLoadSuccess = ({ numPages }: { numPages: number }) => {
    setNumPages(numPages);
  };

  return (
    <div>
      {/* Hidden render target: the page canvases are only used as OCR input. */}
      <div style={{ display: 'none' }} ref={containerRef}>
        <Document
          file={file}
          onLoadSuccess={onDocumentLoadSuccess}
        >
          {Array.from({ length: numPages }, (_, index) => (
            <Page
              key={`page_${index + 1}`}
              pageNumber={index + 1}
              className={`pdf-page-${index + 1}`}
              width={1000}
            />
          ))}
        </Document>
      </div>
      {isProcessing && <><p>scanning document</p><IonSpinner /></>}
      {error && <p className="error" aria-live="assertive">{error}</p>}
      {/* The callback fires automatically when scanning completes, so only a start button is needed. */}
      {!isProcessing && extractedTextData.size === 0 && <IonButton onClick={handleProcessClick}>Start Processing</IonButton>}
    </div>
  );
};

export default PDFExtractor;


// const secretPrompt1 = "Below is some text read from a PDF of an investment holdings report. Can you output a structured text document that recreates the semantic content from this parse for each page? Feel free to use code / verbatim text formatting.";

// const secretPrompt2 = "Thank you. Now please output a nicely formatted table for each account, followed by the number for the total value of that account.";