import google.generativeai as genai
import pdfplumber
import pandas as pd
import os
import re
import time
import json
from datetime import datetime


class TrustcoBankConverter:
    def __init__(self, api_key):
        """Configure the Gemini SDK and build the model used for AI parsing.

        Args:
            api_key: API key handed to ``genai.configure``.
        """
        # Sampling settings applied to every request made through self.model.
        sampling = {
            "temperature": 0.1,
            "top_p": 0.95,
            "max_output_tokens": 8192,
        }

        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel("gemini-2.5-pro", generation_config=sampling)

    def extract_text_from_pdf(self, pdf_path):
        """Extract statement text from a PDF as newline-separated lines.

        For every page, the rows of any tables pdfplumber detects are emitted
        first, followed by the page's plain text lines. A table row is skipped
        when the same content (ignoring whitespace differences) already
        appears in that page's text, so downstream parsers do not see the same
        transaction twice.

        The result is also written to ``extracted_text_debug.txt`` in the
        current directory as a best-effort debugging aid; a failure to write
        that file is reported but does not affect the return value.

        Args:
            pdf_path: Path of the PDF to read.

        Returns:
            The extracted text, or ``""`` if opening or reading the PDF raised.
        """
        all_text = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                page_count = len(pdf.pages)
                print(f"📄 Total pages: {page_count}")

                for i, page in enumerate(pdf.pages, start=1):
                    # Plain text lines of the page, stripped and non-empty.
                    page_text = page.extract_text() or ""
                    text_lines = [ln.strip() for ln in page_text.split('\n') if ln.strip()]

                    # Whitespace-normalised lookup used to drop table rows
                    # that merely repeat a plain-text line.
                    seen = {' '.join(ln.split()) for ln in text_lines}

                    for table in page.extract_tables() or []:
                        for row in table:
                            cells = [str(cell) for cell in row if cell]
                            if not cells:
                                continue
                            row_text = ' '.join(cells)
                            if ' '.join(row_text.split()) in seen:
                                continue
                            all_text.append(row_text)

                    all_text.extend(text_lines)

                    print(f"   Processed page {i}/{page_count}")

        except Exception as e:
            print(f"❌ PDF extraction error: {e}")
            return ""

        full_text = '\n'.join(all_text)
        print(f"✅ Extracted {len(full_text)} characters, {len(all_text)} lines")

        # Debug dump only: an unwritable working directory must not abort
        # the conversion.
        try:
            with open('extracted_text_debug.txt', 'w', encoding='utf-8') as f:
                f.write(full_text)
        except OSError as e:
            print(f"⚠️  Could not write extracted_text_debug.txt: {e}")

        return full_text

    def parse_direct_from_pdf(self, pdf_path):
        """Pull transactions straight out of each page's text with a regex.

        A line counts as a transaction when it looks like
        ``<M-D> <trace digits> <description> <amount>`` with an optional
        trailing ``-``; a trailing ``-`` marks the amount as a withdrawal.
        The year 2025 and the account number are fixed values.

        Args:
            pdf_path: Path of the PDF to read.

        Returns:
            A list of transaction dicts. If reading the PDF raises, the error
            is printed and whatever was collected so far is returned.
        """
        # Date, trace number, description, amount (optionally suffixed by "-").
        row_pattern = re.compile(r'^(\d{1,2}-\d{1,2})\s+(\d+)\s+(.+?)\s+([\d,]+\.\d{2})-?$')
        found = []

        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    print(f"📄 Processing page {page_num}...")

                    page_text = page.extract_text()
                    if not page_text:
                        continue

                    for raw in page_text.split('\n'):
                        entry = raw.strip()
                        hit = row_pattern.match(entry)
                        if hit is None:
                            continue

                        date_part, trace, desc, amount = hit.groups()
                        date = f"{date_part}-2025"
                        amount_num = float(amount.replace(',', ''))

                        # A trailing minus sign marks a withdrawal.
                        if entry.endswith('-'):
                            withdraw, deposit = amount_num, 0.0
                        else:
                            withdraw, deposit = 0.0, amount_num

                        found.append({
                            'Date': date,
                            'Trace #': trace,
                            'Description': desc[:100],
                            'Withdraw': withdraw,
                            'Deposit': deposit,
                            'Account': '34309654',
                            'Accuracy': 'Direct',
                            'Source': 'PDF Parser'
                        })
                        print(f"   ✓ Found: {date} | {trace} | {desc[:30]}... | ${amount_num:.2f}")

        except Exception as e:
            print(f"❌ Direct PDF parsing error: {e}")
            import traceback
            traceback.print_exc()

        return found

    def parse_with_gemini_smart(self, text, max_chars=10000):
        """Ask Gemini to extract transactions from statement text.

        Only the first ``max_chars`` characters of ``text`` are placed in the
        prompt; a warning is printed when that truncates the input. The reply
        is expected to contain a JSON array of objects with ``date``,
        ``trace``, ``description`` and ``amount`` (negative = withdrawal).
        Records that cannot be converted are skipped individually.

        Debug artefacts are written on a best-effort basis:
        ``gemini_extracted.json`` on success and ``error_response.txt`` on
        failure. A failure to write either file never changes the result.

        Args:
            text: Statement text to send to the model.
            max_chars: Maximum number of characters of ``text`` to include in
                the prompt. Defaults to 10000.

        Returns:
            A list of transaction dicts, or ``[]`` if the request or the
            parsing of the reply failed.
        """
        print("🤖 Using Gemini AI for smart parsing...")

        # Kept outside the try so the error handler can always refer to it.
        response_text = None

        try:
            if len(text) > max_chars:
                print(f"⚠️  Statement text is {len(text)} characters; "
                      f"only the first {max_chars} are sent to Gemini")

            prompt = f"""
            You are a banking data extraction expert. Extract ALL financial transactions from this Trustco Bank statement.

            CRITICAL INSTRUCTIONS:
            1. Extract EVERY transaction including dates, trace numbers, descriptions, and amounts
            2. Date format: always use MM-DD-2025
            3. Trace numbers: use the number after the date (6-7 digits), use "1" for CUSTOMER DEPOSIT
            4. Amounts: negative for withdrawals, positive for deposits
            5. Include checks from "CHECKS PAID" section too

            Here is a sample of what to extract:
            From: "11-03  9022530  PBG - G&A OU CORP PMT 2003323565 LITTLECAESA  636.39-"
            Extract: date: "11-03-2025", trace: "9022530", description: "PBG - G&A OU CORP PMT 2003323565 LITTLECAESA", amount: -636.39

            From: "11-10  1  CUSTOMER DEPOSIT  501.00"
            Extract: date: "11-10-2025", trace: "1", description: "CUSTOMER DEPOSIT", amount: 501.00

            From: "1202 11-04  1000.00" (from checks section)
            Extract: date: "11-04-2025", trace: "1202", description: "CHECK #1202", amount: -1000.00

            BANK STATEMENT TEXT:
            ```
            {text[:max_chars]}
            ```

            Return ONLY a JSON array with transaction objects. Each object MUST have:
            - date (string, format: MM-DD-2025)
            - trace (string)
            - description (string)
            - amount (number, negative for withdrawals)

            Example response format:
            [
              {{
                "date": "11-03-2025",
                "trace": "9022530",
                "description": "PBG - G&A OU CORP PMT 2003323565 LITTLECAESA",
                "amount": -636.39
              }},
              {{
                "date": "11-03-2025",
                "trace": "9142530",
                "description": "CAESAR FUND CASHCD LC NORTHEAST SCHENECTA",
                "amount": -790.67
              }}
            ]

            IMPORTANT: Extract ALL transactions you can find. Don't miss any.
            """

            response = self.model.generate_content(prompt)
            response_text = response.text

            print("🔍 Raw Gemini response received")

            json_str = self._extract_json_array(response_text)
            print(f"📝 Parsing JSON response: {len(json_str)} characters")

            transactions_json = json.loads(json_str)
            if not isinstance(transactions_json, list):
                raise ValueError("Gemini response is not a JSON array")

            transactions = []
            for txn in transactions_json:
                try:
                    transactions.append(self._to_transaction(txn))
                except (AttributeError, TypeError, ValueError, OverflowError) as e:
                    # One malformed record must not discard the rest.
                    print(f"⚠️  Skipping transaction due to error: {e}")

            print(f"✅ Gemini AI successfully extracted {len(transactions)} transactions")

            # Debug dump only: failing to write it must not throw away the
            # transactions that were just parsed.
            try:
                with open('gemini_extracted.json', 'w', encoding='utf-8') as f:
                    json.dump(transactions_json, f, indent=2)
            except OSError as e:
                print(f"⚠️  Could not write gemini_extracted.json: {e}")

            return transactions

        except Exception as e:
            print(f"❌ Gemini AI parsing failed: {e}")
            import traceback
            traceback.print_exc()

            # Keep whatever the model returned so the failure can be inspected.
            try:
                with open('error_response.txt', 'w', encoding='utf-8') as f:
                    f.write('No response' if response_text is None else str(response_text))
            except OSError as write_error:
                print(f"⚠️  Could not write error_response.txt: {write_error}")

            return []

    @staticmethod
    def _extract_json_array(response_text):
        """Return the JSON array portion of a model reply, markdown fences removed."""
        cleaned = response_text.strip()
        cleaned = re.sub(r'```json\s*', '', cleaned)
        cleaned = re.sub(r'```\s*', '', cleaned)

        start = cleaned.find('[')
        end = cleaned.rfind(']') + 1
        if start != -1 and end > start:
            return cleaned[start:end]
        return cleaned

    @staticmethod
    def _to_transaction(txn):
        """Convert one JSON record from the model into a transaction dict."""
        amount = txn.get('amount', 0)
        if isinstance(amount, str):
            # Accept "$1,234.56" and the statement's trailing-minus "636.39-".
            cleaned = amount.replace(',', '').replace('$', '').strip()
            if cleaned.endswith('-'):
                amount_num = -float(cleaned.rstrip('-'))
            else:
                amount_num = float(cleaned)
        else:
            amount_num = float(amount)

        date_str = txn.get('date') or ''
        # "MM-DD" without a year gets the fixed statement year appended.
        if date_str and len(date_str.split('-')) == 2:
            date_str = f"{date_str}-2025"

        # JSON null must become an empty cell, not the text "None".
        trace = txn.get('trace')
        description = txn.get('description')

        return {
            'Date': date_str,
            'Trace #': '' if trace is None else str(trace),
            'Description': ('' if description is None else str(description))[:150],
            'Amount': abs(amount_num),
            'Withdraw': abs(amount_num) if amount_num < 0 else 0.0,
            'Deposit': amount_num if amount_num > 0 else 0.0,
            'Account': '34309654',
            'Accuracy': 'AI High',
            'Source': 'Gemini AI'
        }

    def brute_force_parse(self, text):
        """Heuristically extract transactions from raw statement text.

        Every non-empty line that ends in an amount (``1,234.56`` with an
        optional trailing ``-``) is treated as a transaction once a dated line
        has been seen. A line starting with ``M-D`` sets the current date;
        lines without a date inherit the most recent one.

        The description is the text between the date (if any) and the amount.
        On a dated line a leading run of digits is taken as the trace number,
        matching the direct parser; otherwise the first 6-7 digit number
        followed by whitespace is used, and ``"1"`` when none is found.

        A line is a withdrawal when it ends in ``-`` or its description
        contains one of a fixed set of keywords. The year 2025 and the account
        number are fixed values.

        Args:
            text: Statement text, one statement line per text line.

        Returns:
            A list of transaction dicts marked with ``Accuracy: 'Low'``.
        """
        print("🔄 Trying brute force parsing...")

        # Compiled once instead of on every line.
        date_re = re.compile(r'^(\d{1,2}-\d{1,2})\s+')
        amount_re = re.compile(r'([\d,]+\.\d{2})-?\s*$')
        leading_trace_re = re.compile(r'^(\d+)(?:\s+|$)')
        embedded_trace_re = re.compile(r'(?:^|\s)(\d{6,7})\s+')
        withdrawal_keywords = ('PBG', 'CAESAR', 'BLFD', 'BLUELINE', 'IRS', 'NYS',
                               'TAX', 'PAYROLL', 'FEE')

        transactions = []
        current_date = None

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            date_match = date_re.match(line)
            if date_match:
                current_date = f"{date_match.group(1)}-2025"

            amount_match = amount_re.search(line)
            if not amount_match or not current_date:
                continue

            amount = float(amount_match.group(1).replace(',', ''))

            # Description is what lies between the date prefix and the amount,
            # so the date itself no longer leaks into the Description column.
            body_start = date_match.end() if date_match else 0
            description = line[body_start:amount_match.start()].strip()

            trace = "1"
            trace_match = leading_trace_re.match(description) if date_match else None
            if trace_match:
                trace = trace_match.group(1)
                description = description[trace_match.end():].strip()
            else:
                trace_match = embedded_trace_re.search(description)
                if trace_match:
                    trace = trace_match.group(1)
                    description = (description[:trace_match.start()] + ' '
                                   + description[trace_match.end():]).strip()

            upper_description = description.upper()
            is_withdrawal = line.endswith('-') or any(
                word in upper_description for word in withdrawal_keywords)

            transactions.append({
                'Date': current_date,
                'Trace #': trace,
                'Description': description[:100] if description else "Unknown",
                'Withdraw': amount if is_withdrawal else 0.0,
                'Deposit': 0.0 if is_withdrawal else amount,
                'Account': '34309654',
                'Accuracy': 'Low',
                'Source': 'Brute Force'
            })

        print(f"✅ Brute force found {len(transactions)} transactions")
        return transactions

    def process_pdf(self, pdf_path, excel_path):
        """Convert one statement PDF into an Excel workbook.

        Transactions are collected by trying direct regex parsing first, then
        Gemini, then the brute-force parser. The result is cleaned, sorted by
        date and written to ``excel_path`` with a ``Transactions`` sheet and a
        ``Summary`` sheet. Progress and a short report are printed.

        Args:
            pdf_path: Path of the PDF statement to read.
            excel_path: Path of the ``.xlsx`` file to write.

        Returns:
            ``True`` when the workbook was written, ``False`` when no
            transactions were found or saving/reporting raised.
        """
        print(f"\n{'=' * 70}")
        print(f"Processing: {pdf_path}")
        print(f"{'=' * 70}\n")

        start_time = time.time()

        transactions = self._gather_transactions(pdf_path)
        if not transactions:
            print("❌ No transactions found with any method")
            return False

        print(f"✅ Total found: {len(transactions)} transactions\n")

        df = self._build_frame(transactions)

        try:
            # Totals are computed once and reused for the sheet and the report.
            total_withdraw = df['Withdraw'].sum()
            total_deposit = df['Deposit'].sum()
            net = total_deposit - total_withdraw

            with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
                df.to_excel(writer, sheet_name='Transactions', index=False)

                summary_data = {
                    'Metric': [
                        'Total Transactions',
                        'Total Withdrawals',
                        'Total Deposits',
                        'Net Amount'
                    ],
                    'Value': [
                        len(df),
                        f"${total_withdraw:,.2f}",
                        f"${total_deposit:,.2f}",
                        f"${net:,.2f}"
                    ]
                }
                pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)

            elapsed = time.time() - start_time

            print(f"{'=' * 70}")
            print("✅ SUCCESS!")
            print(f"{'=' * 70}")
            print(f"📊 Transactions: {len(df)}")
            print(f"💰 Total Withdrawals: ${total_withdraw:,.2f}")
            print(f"💰 Total Deposits: ${total_deposit:,.2f}")
            print(f"⚖️  Net: ${net:,.2f}")
            print(f"⏱️  Time: {elapsed:.1f} seconds")
            print(f"📁 Saved: {excel_path}")
            print(f"{'=' * 70}\n")

            self._print_sample(df)

            return True

        except Exception as e:
            print(f"❌ Error saving Excel: {e}")
            import traceback
            traceback.print_exc()
            return False

    def _gather_transactions(self, pdf_path):
        """Run the parsers in order of preference and return the first usable result."""
        print("🔍 Method 1: Direct PDF parsing...")
        transactions = self.parse_direct_from_pdf(pdf_path)

        # Fewer than 10 direct hits is treated as a failed direct parse.
        if len(transactions) >= 10:
            return transactions

        print(f"⚠️  Direct parsing only found {len(transactions)} transactions")
        print("🔍 Method 2: Extracting text for AI parsing...")

        full_text = self.extract_text_from_pdf(pdf_path)
        if not full_text:
            print("❌ No text extracted from PDF")
            # Keep whatever the direct parser found instead of discarding it.
            return transactions

        print("🔍 Method 3: Using Gemini AI...")
        ai_transactions = self.parse_with_gemini_smart(full_text)
        if ai_transactions:
            return ai_transactions

        print("⚠️  Gemini AI failed, trying brute force...")
        brute_transactions = self.brute_force_parse(full_text)
        return brute_transactions or transactions

    @staticmethod
    def _build_frame(transactions):
        """Build the cleaned, date-sorted DataFrame that is written to Excel."""
        columns = ['Date', 'Trace #', 'Description', 'Withdraw', 'Deposit', 'Account', 'Accuracy', 'Source']

        df = pd.DataFrame(transactions)
        for col in columns:
            if col not in df.columns:
                df[col] = ''
        # .copy() so the column assignments below act on an independent frame.
        df = df[columns].copy()

        df['Withdraw'] = pd.to_numeric(df['Withdraw'], errors='coerce').fillna(0.0).round(2)
        df['Deposit'] = pd.to_numeric(df['Deposit'], errors='coerce').fillna(0.0).round(2)

        # Drop rows that carry no amount at all.
        df = df[(df['Withdraw'] > 0) | (df['Deposit'] > 0)].copy()

        if len(df) > 0:
            # Rows without a date inherit the date of the row above.
            df['Date'] = df['Date'].replace('', pd.NA).ffill()

            try:
                parsed = pd.to_datetime(df['Date'], errors='coerce')
                # Stable sort keeps same-day transactions in statement order.
                df = df.assign(Date=parsed).sort_values('Date', kind='mergesort')
                df['Date'] = df['Date'].dt.strftime('%m-%d-%Y')
            except (ValueError, TypeError, AttributeError) as e:
                # Leave the data as it is if the dates cannot be converted.
                print(f"⚠️  Could not sort by date: {e}")

        return df

    @staticmethod
    def _print_sample(df):
        """Print up to five transactions as a quick visual check."""
        if len(df) == 0:
            return

        print("📋 Sample transactions:")
        print("-" * 80)
        for _, row in df.head(5).iterrows():
            date_display = row['Date'] if pd.notna(row['Date']) else "NO DATE"
            description = str(row['Description'])
            desc_short = description[:40] + '...' if len(description) > 40 else description
            print(
                f"{date_display:12} | {row['Trace #']:8} | {desc_short:45} | W:{row['Withdraw']:8.2f} | D:{row['Deposit']:8.2f}")
        print("-" * 80 + "\n")


def extract_pdf_to_excel(pdf_path, excel_path, api_key):
    """
    Convert a statement PDF to Excel without letting any exception escape.

    Builds a ``TrustcoBankConverter`` with ``api_key`` and runs its
    ``process_pdf``. Intended as the entry point for Flask callers.

    Returns the result of ``process_pdf``, or ``False`` if constructing the
    converter or processing the file raised.
    """
    outcome = False
    try:
        outcome = TrustcoBankConverter(api_key).process_pdf(pdf_path, excel_path)
    except Exception as exc:
        import traceback

        print(f"❌ Conversion error: {exc}")
        traceback.print_exc()
    return outcome


def main():
    """Convert the hard-coded list of statement PDFs when run as a script.

    The Gemini API key is read from the ``GEMINI_API_KEY`` environment
    variable; when it is not set, the placeholder string is used as before.
    Each existing PDF is converted to ``<name>_converted.xlsx`` next to it;
    missing files are reported and skipped.
    """
    # Prefer the environment so the real key never has to be pasted into source.
    api_key = os.environ.get("GEMINI_API_KEY", "your_api_key_here")

    pdf_files = ["Schenectady Nov.pdf"]

    for pdf_file in pdf_files:
        if not os.path.exists(pdf_file):
            print(f"❌ File not found: {pdf_file}")
            continue

        # splitext only touches the real extension, so ".PDF" or a name with
        # no extension can never produce an output path equal to the input.
        stem, _extension = os.path.splitext(pdf_file)
        excel_file = f"{stem}_converted.xlsx"

        success = extract_pdf_to_excel(pdf_file, excel_file, api_key)

        if success:
            print("✅ Conversion successful!")
        else:
            print(f"❌ Conversion failed for {pdf_file}")


# Run the standalone conversion only when this file is executed as a script,
# not when it is imported (e.g. for extract_pdf_to_excel).
if __name__ == "__main__":
    main()