import google.generativeai as genai
import pdfplumber
import pandas as pd
import json
import os
import time
import re


def _gemini_response_text(response):
    """Return the text carried by a Gemini response, or "" if there is none.

    Reading ``response.text`` can raise instead of returning an empty string
    (the SDK documents a ``ValueError`` when the response holds no usable text
    part), so the access is wrapped and that case is reported as "empty".
    """
    if response is None:
        return ""
    try:
        return response.text or ""
    except (ValueError, AttributeError, IndexError):
        return ""


def extract_data_with_gemini_correct(pdf_path, output_excel_path, api_key, max_pages=None):
    """Extract transactions from a statement PDF with Gemini and save them to Excel.

    Every processed page is rendered to an image and sent to the model together
    with a fixed instruction prompt.  The reply is parsed with
    ``extract_json_from_text``; when that yields nothing, the raw reply is
    handed to ``extract_fallback_transactions``.  All collected transactions
    are turned into a DataFrame and written with ``save_to_excel``.

    Args:
        pdf_path: Path of the PDF statement to read.
        output_excel_path: Destination path passed to ``save_to_excel``.
        api_key: Gemini API key used to configure the client.
        max_pages: Optional cap.  When set and the PDF has more pages, only the
            first ``max_pages`` pages are processed.

    Returns:
        True when at least one transaction was extracted and the workbook was
        saved; False when the file is missing, nothing was extracted, saving
        failed, or an unexpected error occurred.
    """
    print(f"Starting CORRECT Gemini extraction for: {pdf_path}")

    if not os.path.exists(pdf_path):
        print(f"❌ File not found: {pdf_path}")
        return False

    try:
        # Configure Gemini
        genai.configure(api_key=api_key)

        # Model used for every page request.
        model = genai.GenerativeModel('gemini-2.5-pro')

        # The instructions are identical for every page, so build them once.
        # The indentation inside the literal is part of the text sent to the
        # model and is deliberately left exactly as it was.
        prompt_text = """EXTRACT ALL TRANSACTIONS from this Broadview Federal Credit Union BUSINESS EASE CHECKING statement.

                    FORMAT: JSON array with objects containing these EXACT fields:
                    {
                        "Date": "MM/DD",
                        "Description": "Transaction description text ONLY (no amounts)",
                        "TransactionAmount": "Amount with sign (e.g., '2282.95' or '-125.50')",
                        "Balance": "Ending balance"
                    }

                    IMPORTANT RULES:
                    1. Date format MUST be MM/DD (e.g., "06/02")
                    2. Description field should contain ONLY the transaction text WITHOUT any amounts
                    3. TransactionAmount MUST include the sign:
                       - POSITIVE for deposits (e.g., "2282.95", "20.09")
                       - NEGATIVE for withdrawals, checks, fees (e.g., "-125.50", "-28.00", "-1564.60")
                    4. Include ALL transactions including:
                       - External Deposits
                       - Checks (extract check number from description if present)
                       - Withdrawals
                       - Fees
                       - Overdraft protection
                    5. DO NOT include header/footer information
                    6. DO NOT include summary tables
                    7. Extract from BUSINESS EASE CHECKING account ONLY

                    EXAMPLE TRANSACTIONS from your PDF:
                    - "06/02", "External Deposit PAYARC - CR CD DEP 567000000857912", "2282.95", "3223.62"
                    - "06/02", "Eff.05-30 External Withdrawal IRS - USATAXPYMT 270555002181665", "-125.50", "4104.32"
                    - "06/02", "Check 136", "-206.00", "172.44"
                    - "06/02", "Insufficient Funds Charge External Withdrawal (Paid)", "-28.00", "4076.32"

                    Return ONLY the JSON array, nothing else."""

        all_transactions = []

        # Open PDF and process each page
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)

            # Application of page limit
            pages_to_process = pdf.pages
            if max_pages and total_pages > max_pages:
                print(f"⚠️ Page limit applied: only processing first {max_pages} pages.")
                pages_to_process = pdf.pages[:max_pages]
                total_pages = max_pages

            for page_num, page in enumerate(pages_to_process, 1):
                print(f"📄 Processing page {page_num}/{total_pages}...")

                # Convert page to image (low resolution for speed)
                img = page.to_image(resolution=120).original
                prompt = [prompt_text, img]

                # Try with retry logic
                max_retries = 3
                response_text = ""

                for attempt in range(max_retries):
                    try:
                        print(f"   Attempt {attempt + 1}...")
                        response = model.generate_content(
                            prompt,
                            generation_config={
                                "temperature": 0.1,
                                "max_output_tokens": 4000,
                            }
                        )

                        # Read the text through the safe accessor: a reply
                        # without usable text must count as "empty" here and
                        # must not blow up later outside this try block, where
                        # it would abort all remaining pages.
                        response_text = _gemini_response_text(response)
                        if response_text:
                            break
                        print("   Empty response, retrying...")

                    except Exception as e:
                        error_msg = str(e)
                        print(f"   Attempt {attempt + 1} failed: {error_msg}")

                        # Only gateway timeouts are retried, with a linearly
                        # growing pause; any other error gives up on the page.
                        if "504" in error_msg or "timeout" in error_msg.lower():
                            if attempt < max_retries - 1:
                                wait_time = (attempt + 1) * 15
                                print(f"   ⏳ Waiting {wait_time} seconds...")
                                time.sleep(wait_time)
                                continue
                        break

                if not response_text:
                    print(f"❌ Failed to get response for page {page_num}")
                    continue

                # Parse the response
                response_text = response_text.strip()
                print(f"   Response length: {len(response_text)} characters")

                # Extract JSON from response
                json_data = extract_json_from_text(response_text)

                if json_data:
                    print(f"✅ Successfully parsed {len(json_data)} transactions from page {page_num}")

                    # Process each transaction
                    for item in json_data:
                        if isinstance(item, dict):
                            transaction = process_gemini_transaction(item)
                            if transaction:
                                all_transactions.append(transaction)
                else:
                    print(f"⚠️ Could not parse JSON from page {page_num}")
                    print(f"   Response preview: {response_text[:200]}...")

                    # Try to extract using regex as fallback
                    fallback_transactions = extract_fallback_transactions(response_text)
                    all_transactions.extend(fallback_transactions)
                    print(f"   Fallback extracted {len(fallback_transactions)} transactions")

                # Add delay between pages to avoid rate limiting
                if page_num < total_pages:
                    time.sleep(5)

        print(f"\n📊 Total transactions extracted: {len(all_transactions)}")

        if not all_transactions:
            print("❌ No transactions extracted")
            return False

        # Create DataFrame
        df = create_dataframe_from_transactions(all_transactions)

        # Save to Excel
        return save_to_excel(df, output_excel_path)

    except Exception as e:
        # Top-level boundary: callers rely on a boolean status, not exceptions.
        print(f"❌ Error in Gemini extraction: {e}")
        return False


def extract_json_from_text(text):
    """Extract a JSON array (or a single JSON object) embedded in *text*.

    The span from the first ``[`` to the last ``]`` is tried first; if that
    span is absent or cannot be parsed, the span from the first ``{`` to the
    last ``}`` is tried.  Each span is parsed as-is and, on failure, once more
    with trailing commas before ``}`` / ``]`` removed.  The repair is applied
    to the extracted span only, so surrounding prose or code fences do not
    defeat it.

    Args:
        text: Raw response text that may contain JSON.

    Returns:
        The parsed array as a list, a one-element list wrapping a parsed
        object, or None when nothing could be parsed.
    """
    if not isinstance(text, str) or not text:
        return None

    def _parse(candidate):
        """Parse *candidate*, retrying once with trailing commas stripped."""
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")

        # Try to fix common JSON issues: remove trailing commas
        repaired = re.sub(r',\s*}', '}', candidate)
        repaired = re.sub(r',\s*]', ']', repaired)
        if repaired == candidate:
            return None
        try:
            return json.loads(repaired)
        except json.JSONDecodeError:
            return None

    # Array first, then object: an object wrapping the transaction array
    # still yields the inner array, as it did before.
    for opener, closer in (('[', ']'), ('{', '}')):
        start = text.find(opener)
        end = text.rfind(closer) + 1
        if start == -1 or end <= start:
            continue

        data = _parse(text[start:end])
        if data is None:
            continue
        return [data] if isinstance(data, dict) else data

    return None


def process_gemini_transaction(item, account="1802579136"):
    """Convert one parsed transaction object into an output row.

    Field values are coerced to text first, so JSON numbers or ``null``
    (e.g. ``"Balance": 172.44``) no longer cause the transaction to be
    dropped.

    Args:
        item: Mapping with the keys ``Date``, ``Description``,
            ``TransactionAmount`` and ``Balance``.
        account: Value stored in the ``Account`` column.  Defaults to the
            account number that was previously hard-coded here.

    Returns:
        A dict with the keys ``Date``, ``Cheque #``, ``Description``,
        ``Debit``, ``Credit``, ``Payee``, ``Account``, ``OriginalAmount`` and
        ``ExtractedBalance``; or None when *item* is not a mapping or lacks a
        date, description or amount.
    """
    if not isinstance(item, dict):
        return None

    def _text(value):
        """Render a field as stripped text; a missing/null field becomes ""."""
        return "" if value is None else str(value).strip()

    date = _text(item.get("Date"))
    description = _text(item.get("Description"))
    amount_str = _text(item.get("TransactionAmount"))
    balance = _text(item.get("Balance"))

    if not date or not description or not amount_str:
        return None

    # Clean amount - remove $ and commas
    clean_amount = amount_str.replace('$', '').replace(',', '').strip()

    # Non-negative amounts go to Debit (kept as written), negative amounts go
    # to Credit as an absolute value.
    debit = ""
    credit = ""
    try:
        amount_num = float(clean_amount)
    except ValueError:
        # Unparseable amount: keep the row but leave both columns blank.
        amount_num = None
    if amount_num is not None:
        if amount_num >= 0:
            debit = clean_amount
        else:
            credit = str(abs(amount_num))

    # Extract check number ("Check 136", "check #136", ...)
    check_match = re.search(r'check\s*#?\s*(\d+)', description, re.IGNORECASE)
    cheque_num = check_match.group(1) if check_match else ""

    # Clean description (remove a leading "Eff. MM-DD " prefix)
    clean_desc = re.sub(
        r'^Eff\.?\s*\d{1,2}-\d{1,2}\s*', '', description, flags=re.IGNORECASE
    )

    return {
        "Date": date,
        "Cheque #": cheque_num,
        "Description": clean_desc.strip(),
        "Debit": debit,
        "Credit": credit,
        "Payee": "",
        "Account": account,
        "OriginalAmount": clean_amount,
        "ExtractedBalance": balance
    }


def extract_fallback_transactions(text):
    """Recover transactions from a response that could not be parsed as JSON.

    Two shapes are recognised:

    * four double-quoted values separated by commas on a single line, e.g.
      ``"06/02", "External Deposit...", "2282.95", "3223.62"``;
    * an object whose keys are ``Date``, ``Description``,
      ``TransactionAmount`` and ``Balance`` in that order, with string values.
      Whitespace (including line breaks) between the tokens is tolerated, so
      complete objects inside a pretty-printed but broken array are found too.

    Args:
        text: Raw response text.

    Returns:
        A list of transaction dicts built by ``create_transaction_from_parts``,
        in order of appearance in *text*; empty when nothing matches.
    """
    if not text:
        return []

    # Quoted 4-tuple; every piece excludes "\n" so a match never crosses a
    # line boundary (the same reach as matching line by line).
    row_pattern = re.compile(
        r'"([^"\n]+)",[^\S\n]*"([^"\n]+)",[^\S\n]*"([^"\n]+)",[^\S\n]*"([^"\n]+)"'
    )
    # JSON-like object; \s* after "{" and before "}" allows multi-line layout.
    object_pattern = re.compile(
        r'\{\s*"Date":\s*"([^"]+)",\s*"Description":\s*"([^"]+)",'
        r'\s*"TransactionAmount":\s*"([^"]+)",\s*"Balance":\s*"([^"]+)"\s*\}'
    )

    hits = []
    for pattern in (row_pattern, object_pattern):
        hits.extend((m.start(), m.groups()) for m in pattern.finditer(text))
    hits.sort(key=lambda hit: hit[0])

    transactions = []
    for _, (date, desc, amount, balance) in hits:
        trans = create_transaction_from_parts(date, desc, amount, balance)
        if trans:
            transactions.append(trans)

    return transactions


def create_transaction_from_parts(date, desc, amount, balance, account="1802579136"):
    """Build an output row from separately extracted transaction parts.

    Parts are coerced to text first, so a numeric or missing part produces a
    row instead of being swallowed by a catch-all handler.

    Args:
        date: Transaction date as extracted.
        desc: Transaction description; also searched for a cheque number.
        amount: Signed amount; ``$`` and thousands separators are removed.
        balance: Running balance as extracted (only stripped of whitespace).
        account: Value stored in the ``Account`` column.  Defaults to the
            account number that was previously hard-coded here.

    Returns:
        A dict with the keys ``Date``, ``Cheque #``, ``Description``,
        ``Debit``, ``Credit``, ``Payee``, ``Account``, ``OriginalAmount`` and
        ``ExtractedBalance``.
    """
    def _text(value):
        """Render a part as stripped text; None becomes ""."""
        return "" if value is None else str(value).strip()

    description = _text(desc)

    # Clean amount
    clean_amount = _text(amount).replace('$', '').replace(',', '').strip()

    # Non-negative amounts go to Debit (kept as written), negative amounts go
    # to Credit as an absolute value.
    debit = ""
    credit = ""
    try:
        amount_num = float(clean_amount)
    except ValueError:
        # Unparseable amount: keep the row but leave both columns blank.
        amount_num = None
    if amount_num is not None:
        if amount_num >= 0:
            debit = clean_amount
        else:
            credit = str(abs(amount_num))

    # Extract check number ("Check 136", "check #136", ...)
    check_match = re.search(r'check\s*#?\s*(\d+)', description, re.IGNORECASE)
    cheque_num = check_match.group(1) if check_match else ""

    return {
        "Date": _text(date),
        "Cheque #": cheque_num,
        "Description": description,
        "Debit": debit,
        "Credit": credit,
        "Payee": "",
        "Account": account,
        "OriginalAmount": clean_amount,
        "ExtractedBalance": _text(balance)
    }


def create_dataframe_from_transactions(transactions, year=2025):
    """Build the output DataFrame from a list of transaction dicts.

    Steps: make sure every output column exists, convert ``Debit`` and
    ``Credit`` to numbers (unparseable values become 0), sort by date, drop
    duplicate rows and return only the output columns in their fixed order.

    Two details matter for correctness:

    * The sort is stable, so transactions sharing a date keep the order in
      which they were extracted.
    * When an ``ExtractedBalance`` column is present it takes part in the
      duplicate check.  Two rows with the same date, description and amount
      but different running balances are distinct transactions (for example
      two identical fees on one day) and are both kept.

    Args:
        transactions: List of dicts as produced by the transaction builders.
        year: Year appended to the ``MM/DD`` dates for sorting only.  Dates
            that do not parse (including ``02/29`` with a non-leap year) sort
            last.

    Returns:
        DataFrame with the columns ``Date``, ``Cheque #``, ``Description``,
        ``Debit``, ``Credit``, ``Payee``, ``Account``.
    """
    required_columns = ["Date", "Cheque #", "Description", "Debit", "Credit", "Payee", "Account"]

    df = pd.DataFrame(transactions)

    # Ensure all required columns exist
    for col in required_columns:
        if col not in df.columns:
            df[col] = ""

    # Convert numeric columns
    for col in ("Debit", "Credit"):
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)

    # Sort by date; mergesort is stable, so same-day order is preserved.
    sort_key = pd.to_datetime(
        df["Date"].astype(str).str.strip() + f"/{year}",
        format="%m/%d/%Y",
        errors="coerce",
    )
    df = (
        df.assign(SortDate=sort_key)
        .sort_values("SortDate", kind="mergesort")
        .drop(columns=["SortDate"])
    )

    # Remove duplicates.  The balance is compared with "$", "," and spaces
    # removed so the same value written two ways still counts as equal.
    dedup_keys = ["Date", "Description", "Debit", "Credit"]
    if "ExtractedBalance" in df.columns:
        df["_BalanceKey"] = (
            df["ExtractedBalance"].astype(str).str.replace(r"[$,\s]", "", regex=True)
        )
        dedup_keys.append("_BalanceKey")
    df = df.drop_duplicates(subset=dedup_keys)

    # Selecting the output columns also discards every helper column.
    return df[required_columns]


def save_to_excel(df, output_path):
    """Write *df* to a formatted 'Transactions' worksheet.

    The sheet gets styled header cells, a thousands/two-decimal number format
    on every non-zero Debit and Credit cell, column widths sized to the
    content (capped at 50) and a summary section two rows below the data with
    totals, the transaction count and the net change (debits minus credits).

    Args:
        df: DataFrame containing at least the ``Debit`` and ``Credit`` columns.
        output_path: Path of the workbook to create.

    Returns:
        True when the workbook was written, False when any error occurred
        (the error is printed).
    """
    try:
        with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
            df.to_excel(writer, sheet_name='Transactions', index=False)

            book = writer.book
            sheet = writer.sheets['Transactions']

            # Header row: rewrite each title with the header style.
            header_style = book.add_format({
                'bold': True,
                'bg_color': '#4F81BD',
                'font_color': 'white',
                'border': 1,
                'align': 'center'
            })
            for position, title in enumerate(df.columns.values):
                sheet.write(0, position, title, header_style)

            # Money cells: only non-zero amounts are rewritten with the format.
            currency_style = book.add_format({'num_format': '#,##0.00'})
            debit_idx = df.columns.get_loc("Debit")
            credit_idx = df.columns.get_loc("Credit")

            amounts = zip(df["Debit"], df["Credit"])
            for excel_row, (debit_amount, credit_amount) in enumerate(amounts, start=1):
                if debit_amount != 0:
                    sheet.write(excel_row, debit_idx, debit_amount, currency_style)
                if credit_amount != 0:
                    sheet.write(excel_row, credit_idx, credit_amount, currency_style)

            # Column widths: longest rendered value or the title, plus padding.
            for idx, name in enumerate(df.columns):
                longest_cell = df[name].astype(str).map(len).max()
                width = max(longest_cell, len(name)) + 2
                sheet.set_column(idx, idx, min(width, 50))

            # Summary section, one blank row after the data.
            top = len(df) + 2
            summary_style = book.add_format({
                'bold': True,
                'bg_color': '#F2F2F2',
                'border': 1
            })
            total_debits = df['Debit'].sum()
            total_credits = df['Credit'].sum()
            row_count = len(df)

            sheet.write(top, 0, "SUMMARY", summary_style)
            sheet.write(top + 1, 0, "Total Debits (Deposits):")
            sheet.write(top + 1, 1, total_debits, currency_style)
            sheet.write(top + 2, 0, "Total Credits (Withdrawals/Fees):")
            sheet.write(top + 2, 1, total_credits, currency_style)
            sheet.write(top + 3, 0, "Transaction Count:")
            sheet.write(top + 3, 1, row_count)
            sheet.write(top + 4, 0, "Net Change:")
            net_change = total_debits - total_credits
            sheet.write(top + 4, 1, net_change, currency_style)

        print(f"✅ Excel file saved: {output_path}")
        print(f"📊 Summary: {row_count} transactions")
        print(f"💰 Total Debits: ${total_debits:,.2f}")
        print(f"💰 Total Credits: ${total_credits:,.2f}")
        print(f"📈 Net Change: ${net_change:,.2f}")

        return True

    except Exception as e:
        print(f"❌ Error saving Excel: {e}")
        return False


# ALTERNATIVE: SIMPLE TEXT EXTRACTION (No AI)
def extract_with_simple_text(pdf_path, output_excel_path):
    """Extract transactions from the PDF's text layer with a regex (no AI).

    A line counts as a transaction when it starts with an ``M/D`` date and
    ends with two money values (amount, then balance); whatever lies between
    is the description.  Matching rows are built with
    ``create_transaction_from_parts`` and written to *output_excel_path*.

    Args:
        pdf_path: Path of the PDF statement to read.
        output_excel_path: Destination workbook path.

    Returns:
        True when at least one transaction was found and saved; False when
        none were found or the workbook could not be written.
    """
    print("Using simple text extraction...")

    # Money value: optional sign and "$", then either comma-grouped thousands
    # ("1,234.56") or plain digits ("1234.56"), always with two decimals.
    money = r'-?\$?(?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2}'
    # Built once: date, description, amount, balance.
    line_pattern = re.compile(
        r'^(\d{1,2}/\d{1,2})\s+(.+?)\s+(' + money + r')\s+(' + money + r')$'
    )

    transactions = []

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if not text:
                    continue

                for raw_line in text.split('\n'):
                    match = line_pattern.search(raw_line.strip())
                    if not match:
                        continue

                    date, desc, amount, balance = match.groups()

                    # Process the transaction
                    trans = create_transaction_from_parts(date, desc, amount, balance)
                    if trans:
                        transactions.append(trans)
                        print(f"Found: {date} - {desc[:30]}... - {amount}")

    except Exception as e:
        # Best effort: rows collected before the failure are still saved.
        print(f"Error in simple extraction: {e}")

    if not transactions:
        return False

    # Saving is guarded so a write failure is reported through the boolean
    # result, like the rest of this function, rather than raised.
    try:
        df = pd.DataFrame(transactions)
        df = df[["Date", "Cheque #", "Description", "Debit", "Credit", "Payee", "Account"]]
        df["Debit"] = pd.to_numeric(df["Debit"], errors="coerce").fillna(0)
        df["Credit"] = pd.to_numeric(df["Credit"], errors="coerce").fillna(0)
        df.to_excel(output_excel_path, index=False)
    except Exception as e:
        print(f"❌ Error saving Excel: {e}")
        return False

    print(f"✅ Simple extraction saved {len(df)} transactions")
    return True


# MAIN FUNCTION TO USE
def extract_data_with_gemini_full(pdf_path, output_excel_path, api_key, max_pages=None):
    """Run the extraction pipeline: Gemini first, plain-text regex second.

    The Gemini step is attempted only when *api_key* is truthy.  If it is
    skipped or reports failure, ``extract_with_simple_text`` is tried.

    Returns:
        True as soon as one method succeeds, False when both fail.
    """
    rule = "=" * 60
    print(rule)
    print("BROADVIEW FEDERAL CREDIT UNION STATEMENT EXTRACTION")
    print(rule)

    # Step 1: Gemini (short-circuits without a call when no key is given).
    print("\n[1/2] Trying Gemini API extraction...")
    gemini_succeeded = bool(api_key) and extract_data_with_gemini_correct(
        pdf_path, output_excel_path, api_key, max_pages=max_pages
    )

    if not gemini_succeeded:
        # Step 2: text-layer fallback.
        print("\n[2/2] Gemini failed, trying simple text extraction...")
        if not extract_with_simple_text(pdf_path, output_excel_path):
            print("\n❌ All extraction methods failed")
            return False

    return True