# AI Features for KSeF

**GENERAL NOTE:** All code in this document is **conceptual reference architecture** — implementation patterns for the user to adapt in their own system. This skill does NOT run ML models, does NOT perform inference and does NOT require Python, sklearn, pandas or any other runtime dependencies. The agent uses these patterns solely as a knowledge base for explaining algorithms, designing pipelines and helping write code.

**Dependencies required for implementation (NOT dependencies of this skill):** sklearn, pandas, numpy — to be installed by the user in their environment.

All AI/ML features are supportive in nature and require supervision by accounting staff. Performance metrics are design goals and may vary. AI systems do not make binding tax decisions.

---

## Expense Classification

### Algorithm (High-Level)

```python
def classify_expense(invoice_data):
    """
    Expense classification based on multiple data sources
    """
    features = {
        'seller_name': invoice_data.seller_name,
        'item_names': [item.name for item in invoice_data.items],
        'pkwiu_codes': [item.pkwiu for item in invoice_data.items if item.pkwiu],
        'total_amount': invoice_data.total_gross,
        'seller_nip': invoice_data.seller_nip
    }

    # 1. Contractor history (highest priority)
    historical = get_historical_category(features['seller_nip'])
    if historical and historical.confidence > 0.9:
        return historical.category, historical.confidence

    # 2. Keyword matching
    keyword_match = match_keywords(features['item_names'])
    if keyword_match and keyword_match.confidence > 0.85:
        return keyword_match.category, keyword_match.confidence

    # 3. ML model (Random Forest / Neural Network)
    ml_prediction = ml_model.predict(features)

    # 4. Flag for review if low confidence
    if ml_prediction.confidence < 0.8:
        flag_for_manual_review(invoice_data)

    return ml_prediction.category, ml_prediction.confidence
```

### Expense Categories (Examples)

```python
COST_CATEGORIES = {
    # External services
    400: "External services (general)",
    401: "Transport services",
    402: "IT services (hosting, development, IT support)",
    403: "Legal and advisory services",
    404: "Rental and lease services",
    405: "Marketing and advertising services",
    406: "Accounting services",
    407: "Consulting services",

    # Materials and raw materials
    500: "Materials and raw materials (general)",
    501: "Energy, water, fuel",
    502: "Office supplies",
    503: "Spare parts",

    # Other
    600: "Salaries and related costs",
    700: "Depreciation",
}
```

### Keywords (Examples)

```python
KEYWORDS = {
    402: ["hosting", "server", "cloud", "AWS", "Azure", "development",
          "programming", "IT support", "software", "license"],
    405: ["advertising", "marketing", "Google Ads", "Facebook Ads",
          "social media", "SEO", "content"],
    501: ["energy", "electricity", "gas", "water", "fuel", "gasoline"],
    502: ["paper", "pen", "toner", "office", "supplies"],
}
```

### ML Model Training

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

class ExpenseClassifier:
    def __init__(self):
        self.model = RandomForestClassifier(n_estimators=200, random_state=42)
        self.vectorizer = TfidfVectorizer(max_features=500)

    def train(self, historical_invoices):
        """
        Training on historical data
        """
        # Prepare data
        X_text = [
            f"{inv.seller_name} {' '.join(inv.item_names)}"
            for inv in historical_invoices
        ]
        X_vectors = self.vectorizer.fit_transform(X_text)

        y = [inv.category for inv in historical_invoices]

        # Train model
        self.model.fit(X_vectors, y)

    def predict(self, invoice):
        """
        Category prediction
        """
        X_text = f"{invoice.seller_name} {' '.join(invoice.item_names)}"
        X_vector = self.vectorizer.transform([X_text])

        prediction = self.model.predict(X_vector)[0]
        probabilities = self.model.predict_proba(X_vector)[0]
        confidence = max(probabilities)

        return {
            'category': prediction,
            'confidence': confidence,
            'alternatives': self._get_alternatives(probabilities)
        }
```

---

## Anomaly and Fraud Detection

### Anomaly Detection (Isolation Forest)

```python
from sklearn.ensemble import IsolationForest
import numpy as np

class FraudDetector:
    def __init__(self):
        self.model = IsolationForest(contamination=0.05, random_state=42)
        self.is_trained = False

    def extract_features(self, invoice):
        """Feature extraction for analysis"""
        return np.array([
            invoice.total_gross,
            len(invoice.items),
            invoice.items_avg_price,
            invoice.payment_term_days,
            invoice.hour_of_day,  # 0-23
            int(invoice.is_weekend),  # 0 or 1
            invoice.seller_transaction_count,
            invoice.seller_avg_amount,
            invoice.amount_vs_avg_ratio  # current / average
        ])

    def train(self, historical_invoices):
        """Training on historical data"""
        features = [self.extract_features(inv) for inv in historical_invoices]
        self.model.fit(features)
        self.is_trained = True

    def detect(self, invoice):
        """Anomaly detection"""
        if not self.is_trained:
            return {'anomaly': False, 'reason': 'Model not trained'}

        features = self.extract_features(invoice)
        prediction = self.model.predict([features])[0]

        if prediction == -1:  # Anomaly
            return {
                'anomaly': True,
                'risk_level': 'HIGH',
                'reasons': self._analyze_reasons(invoice),
                'action': 'MANUAL_REVIEW_REQUIRED'
            }

        return {'anomaly': False}

    def _analyze_reasons(self, invoice):
        """Analyze anomaly reasons"""
        reasons = []

        if invoice.total_gross > invoice.seller_avg_amount * 3:
            reasons.append("Amount 3x greater than average from this seller")

        if invoice.is_weekend and invoice.hour_of_day < 6:
            reasons.append("Issued at night on weekend (unusual)")

        if invoice.seller_transaction_count == 1:
            reasons.append("First contact with this seller")

        if invoice.payment_term_days < 3:
            reasons.append("Very short payment term (possible phishing)")

        return reasons
```

### Phishing Invoice Detection

```python
def detect_phishing_invoice(invoice):
    """
    Detects potential phishing invoices
    """
    score = 0
    reasons = []

    # 1. Similar name to known contractor
    similar = find_similar_contractor_names(invoice.seller_name)
    for known_contractor in similar:
        if known_contractor.nip != invoice.seller_nip:
            score += 30
            reasons.append(f"Similar name to {known_contractor.name} but different NIP")

        if known_contractor.bank_account != invoice.bank_account:
            score += 40
            reasons.append("Different bank account than known contractor")

    # 2. Short payment term
    if invoice.payment_term_days <= 2:
        score += 20
        reasons.append("Very short payment term (typical for phishing)")

    # 3. First contact
    if get_transaction_count(invoice.seller_nip) == 0:
        score += 10
        reasons.append("First time from this seller")

    # 4. High amount on first contact
    if score > 0 and invoice.total_gross > 10000:
        score += 15
        reasons.append("High amount on first/suspicious contact")

    if score >= 50:
        return {
            'phishing_detected': True,
            'risk': 'CRITICAL',
            'score': score,
            'reasons': reasons,
            'action': 'BLOCK_PAYMENT_AND_VERIFY'
        }

    return {'phishing_detected': False}
```

### VAT Carousel Detection

```python
def detect_vat_carousel(invoices, time_window_days=30):
    """
    Detects potential VAT carousel patterns
    """
    # Build transaction graph
    graph = build_transaction_graph(invoices)

    # Look for cycles (A -> B -> C -> A)
    cycles = find_cycles(graph)

    suspicious = []
    for cycle in cycles:
        # Check suspicious characteristics
        if is_suspicious_cycle(cycle):
            suspicious.append({
                'cycle': cycle,
                'risk': 'CRITICAL',
                'participants': [node.nip for node in cycle],
                'total_value': sum(edge.amount for edge in cycle),
                'time_span_days': get_cycle_duration(cycle),
                'action': 'REPORT_TO_TAX_OFFICE'
            })

    return suspicious

def is_suspicious_cycle(cycle):
    """Is the cycle suspicious?"""
    # 1. Cycle closes within <30 days
    if get_cycle_duration(cycle) > 30:
        return False

    # 2. Similar amounts (+/-10%)
    amounts = [edge.amount for edge in cycle]
    if max(amounts) / min(amounts) > 1.1:
        return False

    # 3. Same goods/services
    items = [edge.item_description for edge in cycle]
    if not all_similar(items):
        return False

    return True
```

---

## Cash Flow Prediction

### Predictive Model

```python
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

class CashFlowPredictor:
    def __init__(self):
        self.model = RandomForestRegressor(n_estimators=100, random_state=42)

    def prepare_training_data(self, historical_data):
        """
        Training data preparation
        DataFrame with columns:
        - invoice_due_date, invoice_amount, contractor_nip
        - payment_term_days, actual_payment_date, days_late
        """
        X = historical_data[[
            'invoice_amount',
            'payment_term_days',
            'contractor_avg_days_late',
            'contractor_payment_reliability',  # % on time
            'month',
            'is_end_of_quarter'
        ]]

        y = historical_data['days_late']

        return X, y

    def train(self, historical_data):
        X, y = self.prepare_training_data(historical_data)
        self.model.fit(X, y)

    def predict_payment_date(self, invoice):
        """Predict actual payment date"""
        contractor_stats = get_contractor_stats(invoice.buyer_nip)

        features = pd.DataFrame([{
            'invoice_amount': invoice.total_gross,
            'payment_term_days': invoice.payment_term_days,
            'contractor_avg_days_late': contractor_stats['avg_days_late'],
            'contractor_payment_reliability': contractor_stats['reliability'],
            'month': invoice.issue_date.month,
            'is_end_of_quarter': invoice.issue_date.month in [3, 6, 9, 12]
        }])

        predicted_days_late = self.model.predict(features)[0]
        predicted_date = invoice.payment_due_date + timedelta(days=int(predicted_days_late))

        return {
            'predicted_payment_date': predicted_date,
            'expected_days_late': int(predicted_days_late),
            'confidence': self._calculate_confidence(features)
        }

    def predict_monthly_cash_flow(self, year, month):
        """Monthly forecast"""
        # Sales invoices due in this month
        sales_invoices = get_invoices_due_in_month(year, month, type='sales')

        predicted_income = 0
        for invoice in sales_invoices:
            prediction = self.predict_payment_date(invoice)
            # Only if predicted payment is in this month
            if prediction['predicted_payment_date'].month == month:
                predicted_income += invoice.total_gross

        # Purchase invoices
        purchase_invoices = get_invoices_due_in_month(year, month, type='purchases')
        predicted_expenses = sum(inv.total_gross for inv in purchase_invoices)

        return {
            'month': f"{year}-{month:02d}",
            'predicted_income': predicted_income,
            'predicted_expenses': predicted_expenses,
            'net_cash_flow': predicted_income - predicted_expenses,
            'note': 'Prediction is an estimate'
        }
```

### Contractor Statistics

```python
def get_contractor_stats(nip):
    """
    Calculate payment statistics for a contractor
    """
    invoices = get_all_invoices_for_contractor(nip)

    if not invoices:
        return {
            'avg_days_late': 0,
            'reliability': 1.0,  # No history = optimistic assumption
            'total_invoices': 0
        }

    days_late_list = []
    paid_on_time = 0

    for inv in invoices:
        if inv.payment_date:
            days_late = (inv.payment_date - inv.payment_due_date).days
            days_late_list.append(max(0, days_late))  # Only positive

            if days_late <= 0:
                paid_on_time += 1

    return {
        'avg_days_late': sum(days_late_list) / len(days_late_list),
        'reliability': paid_on_time / len(invoices),
        'total_invoices': len(invoices),
        'max_days_late': max(days_late_list) if days_late_list else 0
    }
```

---

## AI Best Practices

### 1. Continuous Learning

```python
def retrain_models_monthly():
    """
    Retrain models monthly on fresh data
    """
    # Get data from last 12 months
    end_date = datetime.now()
    start_date = end_date - timedelta(days=365)

    historical_data = get_invoices(start_date, end_date)

    # Retrain expense classifier
    expense_classifier.train(historical_data)

    # Retrain anomaly detector
    fraud_detector.train(historical_data)

    # Retrain cash flow predictor
    cash_flow_predictor.train(historical_data)

    save_models()  # Save to disk
```

### 2. Human-in-the-Loop

```python
def classify_with_review(invoice):
    """
    Classification with flagging for review
    """
    prediction = expense_classifier.predict(invoice)

    if prediction['confidence'] < 0.8:
        # Low confidence -> human review
        task = create_review_task(
            invoice=invoice,
            suggested_category=prediction['category'],
            confidence=prediction['confidence'],
            alternatives=prediction['alternatives']
        )
        return {
            'category': None,  # Wait for review
            'status': 'PENDING_REVIEW',
            'task_id': task.id
        }

    # High confidence -> auto-classify
    return {
        'category': prediction['category'],
        'status': 'AUTO_CLASSIFIED',
        'confidence': prediction['confidence']
    }
```

### 3. Audit Trail for AI

```python
def log_ai_decision(invoice, prediction, action):
    """
    Log AI decisions for audit
    """
    ai_audit_log.insert({
        'timestamp': datetime.now(),
        'invoice_id': invoice.id,
        'model_name': 'ExpenseClassifier',
        'model_version': '2.1',
        'prediction': prediction,
        'confidence': prediction['confidence'],
        'action_taken': action,
        'reviewed_by_human': action == 'MANUAL_REVIEW'
    })
```

---

**Final warning:** All AI features require regular monitoring, validation and supervision by qualified staff. Do not rely solely on automated decisions in tax and accounting matters.

**Reminder:** The code examples above are reference architecture. This skill does not contain trained models, ML artifacts or executable files. Implementation requires installing dependencies (sklearn, pandas, numpy) and training models on user data in their own environment.