import google.generativeai as genai
import pdfplumber
import pandas as pd
from PIL import Image
import io
import os
import json
import re

try:
    from PIL import PngImagePlugin
except ImportError:
    pass


def extract_page_with_gemini(page_image, model):
    """
    Ask Gemini to extract the transaction rows shown on one statement page.

    Args:
        page_image: Image of the page; passed unchanged to
            ``model.generate_content`` together with the prompt.
        model: Object exposing ``generate_content(parts)`` whose result has a
            ``text`` attribute (for example ``genai.GenerativeModel``).

    Returns:
        A list of dicts, each with exactly the keys "Date", "Description",
        "Debit" and "Credit". Missing or null fields become "". Rows with
        neither a date nor a description are dropped, as are array elements
        that are not JSON objects. An empty list is returned when the reply
        has no text, contains no JSON array, or cannot be parsed.

    Errors raised by ``model.generate_content`` itself are not caught here.
    """
    prompt = """
    You are a bank statement PDF-to-Excel converter.
    Extract ONLY transaction data from this PDF page.

    REQUIRED FORMAT:
    - Return ONLY valid JSON array of objects.
    - Each object = one transaction row.
    - Each object must have EXACTLY these keys: "Date", "Description", "Debit", "Credit"
    - If Debit is empty or doesn't exist, use empty string ""
    - If Credit is empty or doesn't exist, use empty string ""
    - Date should be in MM/DD/YYYY or M/D/YYYY format if available
    - Description should be the full transaction description
    - Skip ALL other information (headers, footers, summaries, totals, etc.)
    - Only extract actual transaction lines with dates and amounts

    Examples:
    - Transaction: {"Date": "09/02/2025", "Description": "Point Of Sale Withdrawal STEWART S SHOP 129 CATSKILL NYUS", "Debit": "25.11", "Credit": ""}
    - Payment: {"Date": "09/24/2025", "Description": "BA ELECTRONIC PAYMENT", "Debit": "", "Credit": "4107.68"}

    Now extract ONLY transaction data from this page:
    """

    response = model.generate_content([prompt, page_image])

    # NOTE(review): the Gemini SDK's ``.text`` accessor is expected to raise
    # ValueError when the response carries no text part (e.g. a blocked
    # reply); treat that as "nothing extracted" instead of aborting the run.
    try:
        raw_text = (response.text or "").strip()
    except ValueError as e:
        print(f"No text in Gemini response: {e}")
        return []

    # The model often wraps the array in prose or a code fence, so grab the
    # outermost [...] span rather than parsing the whole reply.
    match = re.search(r'\[.*\]', raw_text, re.DOTALL)
    if not match:
        return []

    # Raw line breaks are illegal inside JSON strings; flatten them to spaces.
    json_str = match.group(0).replace('\n', ' ').replace('\r', ' ')

    # Parse the text as-is first. Only when that fails, retry with trailing
    # commas removed, so valid JSON whose descriptions happen to contain
    # ", }" or ", ]" is never rewritten.
    candidates = (json_str, re.sub(r',\s*([\]}])', r'\1', json_str))
    parse_error = None
    data = None
    for candidate in candidates:
        try:
            data = json.loads(candidate)
            break
        except ValueError as e:  # json.JSONDecodeError subclasses ValueError
            parse_error = e
    else:
        print(f"JSON parsing error: {parse_error}")
        print(f"Raw response: {raw_text[:500]}...")
        return []

    if not isinstance(data, list):
        return []

    required_keys = ("Date", "Description", "Debit", "Credit")
    cleaned_data = []
    for item in data:
        # Skip stray non-object elements instead of losing the whole page.
        if not isinstance(item, dict):
            continue
        cleaned_item = {}
        for key in required_keys:
            value = item.get(key, "")
            cleaned_item[key] = "" if value is None else value
        # Only keep rows that have at least a date or a description.
        if cleaned_item["Date"] or cleaned_item["Description"]:
            cleaned_data.append(cleaned_item)

    return cleaned_data


def _format_amount(value):
    """Return one Debit/Credit cell as a plain numeric string.

    Currency symbols ("$"), thousands separators and whitespace are stripped
    before parsing, and a trailing ".0" is removed from whole numbers. Missing
    values become "". Text that still cannot be parsed as a number is returned
    unchanged so that no amount is silently blanked.
    """
    if value is None or (isinstance(value, float) and value != value):
        return ""
    text = str(value).strip()
    if not text:
        return ""
    try:
        number = float(re.sub(r"[$,\s]", "", text))
    except ValueError:
        return text
    if number != number:  # a literal "nan" is not an amount
        return ""
    formatted = str(number)
    return formatted[:-2] if formatted.endswith(".0") else formatted


def convert_pdf_to_excel(pdf_path, excel_path, api_key):
    """
    Extract the transactions of a statement PDF with Gemini and save them
    to an Excel workbook.

    Every page is rendered to an image and passed to
    ``extract_page_with_gemini``; the rows from all pages are combined and
    written to a sheet named "Transactions" with auto-sized columns.

    Args:
        pdf_path: Path of the PDF to read.
        excel_path: Path of the .xlsx file to write (openpyxl engine).
        api_key: API key passed to ``genai.configure``.

    Returns:
        True when the workbook was written. False when the PDF does not
        exist, no transactions were extracted, or saving failed. Progress
        and error messages are printed to stdout.
    """
    if not os.path.exists(pdf_path):
        print(f"❌ ERROR: File '{pdf_path}' not found.")
        return False

    print(f"📄 Processing {pdf_path}...")

    # Configure Gemini
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-2.5-pro")

    all_transactions = []

    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages, start=1):
            print(f"🔍 Processing page {i} with Gemini Vision...")

            # Convert page to image for Gemini Vision
            try:
                pil_img = page.to_image(resolution=200).original
            except AttributeError:
                # Fall back to the page-image wrapper itself when it exposes
                # no ``.original`` attribute.
                pil_img = page.to_image(resolution=200)

            page_transactions = extract_page_with_gemini(pil_img, model)
            if page_transactions:
                all_transactions.extend(page_transactions)
                print(f"✅ Page {i}: extracted {len(page_transactions)} transactions")
            else:
                print(f"⚠️ Page {i}: no transactions extracted")

    if not all_transactions:
        print("❌ No transaction data extracted from the PDF.")
        return False

    # Convert to DataFrame
    df = pd.DataFrame(all_transactions)

    # Remove rows in which every field is missing
    df = df.dropna(how='all')

    # Normalise the amount columns to clean numeric strings. Values such as
    # "$1,234.56" are parsed; anything unparseable is kept as text rather
    # than being coerced to an empty cell.
    for col in ['Debit', 'Credit']:
        df[col] = df[col].map(_format_amount)

    # Save to Excel
    try:
        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
            df.to_excel(writer, sheet_name="Transactions", index=False)

            # Auto-adjust column widths to the longest cell, header included
            worksheet = writer.sheets["Transactions"]
            for column in worksheet.columns:
                column_letter = column[0].column_letter
                max_length = max(
                    len("" if cell.value is None else str(cell.value))
                    for cell in column
                )
                adjusted_width = min(max_length + 2, 50)  # Cap at 50 characters
                worksheet.column_dimensions[column_letter].width = adjusted_width

        print(f"\n🎉 SUCCESS: Saved {len(df)} transactions to {excel_path}")
        print(f"📊 Columns: {', '.join(df.columns.tolist())}")
        return True

    except Exception as e:
        # Top-level boundary: report the failure through the return value,
        # matching the other error paths of this function.
        print(f"❌ Error saving to Excel: {e}")
        return False


# Script entry point.
# Usage: python <this file> [PDF_PATH] [EXCEL_PATH]
# The Gemini API key is read from the GEMINI_API_KEY environment variable so
# that no secret has to be written into the source file.
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    # Fall back to the example file names when no paths are given.
    pdf_file = cli_args[0] if len(cli_args) > 0 else "11-12-2025 OCTOBER BOA CC 3278 STATEMENT.pdf"
    excel_file = cli_args[1] if len(cli_args) > 1 else "transactions.xlsx"
    api_key = os.environ.get("GEMINI_API_KEY", "YOUR_GEMINI_API_KEY_HERE")

    success = convert_pdf_to_excel(pdf_file, excel_file, api_key)
    if success:
        print("✅ Conversion complete!")
    else:
        print("❌ Conversion failed.")