#!/usr/bin/env python3
"""
pdf_to_txt_chunks.py

Extracts text from one or more PDF files while preserving layout (line breaks, spacing,
speaker labels, and line numbers as much as the source allows), then splits the output
into ~N-page chunks (default 200) for easier downstream analysis.

Features:
- Uses pdfminer.six with tuned LAParams to better preserve layout.
- Writes a master .txt (all pages) and chunked .txt files by page range.
- Inserts page separators like: === [FILENAME] — Page 12 ===
- Detects pages with low/empty text and writes an OCR-needed report.
- Optional: Only process a page range within a PDF.
- Batch mode: accepts multiple PDF paths or a folder (with --glob).

Requirements (install locally before running):
    pip install pdfminer.six

Recommended if you have scanned/bitmap PDFs (no text layer):
    pip install ocrmypdf
    # Example usage before extraction:
    #   ocrmypdf --output-type pdfa input.pdf input_ocr.pdf

Usage examples:
    python pdf_to_txt_chunks.py "/path/clerks_transcript.pdf"
    python pdf_to_txt_chunks.py "/path/entire_transcript_combined.pdf" --pages-per-chunk 200
    python pdf_to_txt_chunks.py "/data/*.pdf" --glob
    python pdf_to_txt_chunks.py "/path/file.pdf" --start-page 1 --end-page 800

Outputs go next to the input PDF (or to --out-dir if specified).

Author: ChatGPT
"""
from __future__ import annotations

import argparse
import glob
import io
import os
import sys
from dataclasses import dataclass
from typing import Iterable, List, Optional, Tuple

from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager


@dataclass
class ExtractResult:
    """Summary of one PDF extraction run: output paths plus per-page stats."""

    # Path of the source PDF, as passed in by the caller.
    pdf_path: str
    # Total pages in the PDF file (the whole file, not just the processed range).
    total_pages: int
    # 1-based page numbers whose extracted text was below the threshold --
    # likely scanned/image-only pages that need OCR.
    pages_with_low_text: List[int]
    # Path of the combined .txt covering the whole processed range.
    output_all_txt: str
    # Paths of the per-chunk .txt files, in ascending page order.
    chunk_files: List[str]


def human_name(pdf_path: str) -> str:
    """Return the file's base name with its extension stripped."""
    stem, _ext = os.path.splitext(os.path.basename(pdf_path))
    return stem


def ensure_dir(path: str) -> None:
    """Create *path* (including any missing parents); no-op if it already exists."""
    os.makedirs(path, exist_ok=True)


def extract_pdf(
    pdf_path: str,
    out_dir: str,
    pages_per_chunk: int = 200,
    start_page: Optional[int] = None,
    end_page: Optional[int] = None,
    min_chars_per_page: int = 50,
) -> ExtractResult:
    """Extract text from *pdf_path* page by page with layout-preserving LAParams,
    writing one combined .txt plus ~``pages_per_chunk``-page chunk files to *out_dir*.

    Parameters:
        pdf_path: source PDF file.
        out_dir: output directory (created if missing).
        pages_per_chunk: number of pages per chunk file (default 200).
        start_page / end_page: optional 1-based inclusive range to process.
        min_chars_per_page: pages whose stripped text is shorter than this are
            flagged as likely scanned/image-only (OCR candidates).

    Returns:
        ExtractResult with output paths and the list of low-text page numbers.

    Raises:
        ValueError: if the page range is invalid or ``pages_per_chunk < 1``.
    """
    if pages_per_chunk < 1:
        # Without this guard a non-positive chunk size would silently drop output.
        raise ValueError(f"pages_per_chunk must be >= 1, got {pages_per_chunk}")

    ensure_dir(out_dir)
    name = human_name(pdf_path)

    # Tuned LAParams – adjust as needed if your PDFs space words oddly.
    laparams = LAParams(
        line_overlap=0.5,   # consider lines that overlap this fraction to be the same line
        char_margin=2.0,    # larger groups characters into words more readily
        word_margin=0.1,    # lower keeps words tighter (increase if words run together)
        line_margin=0.3,    # smaller keeps lines tighter (increase if lines merge)
        boxes_flow=0.5,     # how much to rely on detected text box order (-1..+1)
        all_texts=True,     # include figures/form-like text when possible
    )

    # First pass: count pages only. We deliberately do not keep the PDFPage
    # objects -- they reference the open stream and must not outlive the handle.
    with open(pdf_path, "rb") as fp:
        total_pages = sum(1 for _ in PDFPage.get_pages(fp))

    sp = start_page if start_page is not None else 1
    ep = end_page if end_page is not None else total_pages
    if sp < 1 or ep > total_pages or sp > ep:
        raise ValueError(f"Invalid page range: {sp}-{ep} for {total_pages} total pages")

    # Prepare outputs; chunk files look like {name}__p0001-0200.txt
    all_txt_path = os.path.join(out_dir, f"{name}__ALL_p{sp:04d}-{ep:04d}.txt")
    chunk_files: List[str] = []
    pages_with_low_text: List[int] = []

    # Per-chunk accumulation state.
    current_chunk_start_page = sp
    current_chunk_end_page = min(sp + pages_per_chunk - 1, ep)
    current_chunk_lines: List[str] = []
    all_lines: List[str] = []

    def flush_chunk(start_p: int, end_p: int, lines: List[str]) -> Optional[str]:
        """Write accumulated page blocks to one chunk file; return its path, or None if empty."""
        if not lines:
            return None
        chunk_path = os.path.join(out_dir, f"{name}__p{start_p:04d}-{end_p:04d}.txt")
        with open(chunk_path, "w", encoding="utf-8", newline="") as f:
            f.write("".join(lines))
        return chunk_path

    def extract_page_text(fp, page_index: int) -> str:
        """Extract one 0-based page from the already-open handle.

        pdfminer's extract_text_to_fp parses from the start of the stream, so
        seek(0) on the shared handle replaces the original code path that
        re-opened the file for every single page. (The original primary call
        passed fp=None plus a nonexistent 'pages' kwarg, which always raised
        TypeError and fell into that slow fallback.)
        """
        fp.seek(0)
        buf = io.StringIO()
        extract_text_to_fp(
            fp,
            buf,
            laparams=laparams,
            page_numbers=[page_index],
            output_type="text",
            codec="utf-8",
        )
        return buf.getvalue()

    with open(pdf_path, "rb") as fp:
        for idx in range(sp, ep + 1):
            page_text = extract_page_text(fp, idx - 1)

            # Clear page separator preserves pagination context downstream.
            page_block = f"\n=== [{name}] — Page {idx} ===\n" + page_text

            # Very little text usually means an image-only/scanned page.
            if len(page_text.strip()) < min_chars_per_page:
                pages_with_low_text.append(idx)

            all_lines.append(page_block)
            current_chunk_lines.append(page_block)

            # current_chunk_end_page is clamped to ep, so the final (possibly
            # short) chunk is also flushed here -- no post-loop flush is needed.
            if idx == current_chunk_end_page:
                chunk_path = flush_chunk(
                    current_chunk_start_page, current_chunk_end_page, current_chunk_lines
                )
                if chunk_path:
                    chunk_files.append(chunk_path)
                current_chunk_start_page = idx + 1
                current_chunk_end_page = min(current_chunk_start_page + pages_per_chunk - 1, ep)
                current_chunk_lines = []

    # Write the full combined text for the processed range.
    with open(all_txt_path, "w", encoding="utf-8", newline="") as f_all:
        f_all.write("".join(all_lines))

    return ExtractResult(
        pdf_path=pdf_path,
        total_pages=total_pages,
        pages_with_low_text=pages_with_low_text,
        output_all_txt=all_txt_path,
        chunk_files=chunk_files,
    )


def discover_inputs(args: argparse.Namespace) -> List[str]:
    """Resolve the positional ``inputs`` into a list of existing files.

    With ``--glob``, each input is expanded as a shell-style pattern; matches
    are sorted because ``glob.glob`` returns results in arbitrary filesystem
    order, which would make batch processing order nondeterministic. Without
    ``--glob``, the inputs are taken verbatim in command-line order.

    Raises:
        SystemExit: when nothing resolves to an existing file.
    """
    inputs: List[str] = []
    if args.glob:
        for pattern in args.inputs:
            # sorted() keeps run order reproducible across filesystems.
            inputs.extend(sorted(glob.glob(pattern)))
    else:
        inputs = list(args.inputs)
    # Keep only real files (silently drops directories and missing paths).
    inputs = [p for p in inputs if os.path.isfile(p)]
    if not inputs:
        raise SystemExit("No input PDF files found. Check your paths or use --glob for patterns.")
    return inputs


def main(argv: Optional[List[str]] = None) -> int:
    """CLI entry point: parse arguments, extract each input PDF, write a report.

    Each PDF's outputs go into its own ``<name>__txt_parts`` subfolder. Errors
    on one PDF do not stop the batch.

    Returns:
        Process exit code: 0 if every input succeeded, 1 if any failed.
    """
    parser = argparse.ArgumentParser(description="Convert PDFs to .txt and split into ~N-page chunks with layout preserved.")
    parser.add_argument("inputs", nargs="+", help="PDF file(s) or glob pattern(s) if used with --glob")
    parser.add_argument("--glob", action="store_true", help="Treat inputs as shell-style glob patterns")
    parser.add_argument("--out-dir", default="", help="Output directory (default: alongside each input PDF)")
    parser.add_argument("--pages-per-chunk", type=int, default=200, help="Number of pages per chunk (default: 200)")
    parser.add_argument("--start-page", type=int, default=None, help="Start page (1-based, inclusive)")
    parser.add_argument("--end-page", type=int, default=None, help="End page (1-based, inclusive)")
    parser.add_argument("--min-chars-per-page", type=int, default=50, help="Below this, page is flagged for OCR (default: 50)")

    args = parser.parse_args(argv)

    inputs = discover_inputs(args)

    any_errors = False
    for pdf_path in inputs:
        base_out_dir = args.out_dir if args.out_dir else os.path.dirname(os.path.abspath(pdf_path))
        # Create a subfolder for each PDF to keep things tidy.
        sub_out = os.path.join(base_out_dir, f"{human_name(pdf_path)}__txt_parts")
        try:
            result = extract_pdf(
                pdf_path=pdf_path,
                out_dir=sub_out,
                pages_per_chunk=args.pages_per_chunk,
                start_page=args.start_page,
                end_page=args.end_page,
                min_chars_per_page=args.min_chars_per_page,
            )
            # Write a small per-PDF report next to the chunks.
            report_path = os.path.join(sub_out, f"{human_name(pdf_path)}__report.txt")
            with open(report_path, "w", encoding="utf-8") as rf:
                rf.write(f"PDF: {pdf_path}\n")
                rf.write(f"Total pages in file: {result.total_pages}\n")
                # Use `is not None` (not truthiness) so an explicit value is
                # never silently replaced by the default.
                sp = args.start_page if args.start_page is not None else 1
                ep = args.end_page if args.end_page is not None else result.total_pages
                rf.write(f"Processed page range: {sp}-{ep}\n")
                rf.write(f"Combined text: {result.output_all_txt}\n")
                rf.write("Chunk files:\n")
                for cf in result.chunk_files:
                    rf.write(f"  - {cf}\n")
                if result.pages_with_low_text:
                    rf.write("\nPages with low/empty text (consider OCR):\n")
                    rf.write(", ".join(str(p) for p in result.pages_with_low_text) + "\n")
                else:
                    rf.write("\nAll processed pages had sufficient text.\n")

            print(f"[OK] {pdf_path}\n  All text: {result.output_all_txt}\n  Report:  {report_path}")
        except Exception as e:
            # Broad catch is deliberate at this batch boundary: one bad PDF
            # must not abort the rest of the run.
            any_errors = True
            print(f"[ERROR] {pdf_path}: {e}", file=sys.stderr)

    return 1 if any_errors else 0


if __name__ == "__main__":
    # SystemExit with main()'s int result propagates it as the process exit code.
    raise SystemExit(main())
