Files
BoMtoCost/octo_fill.py
David Rice 1ebd78f538 Finished
2026-05-05 15:02:49 +01:00

404 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Octo Fill Silicon Expert
==========================
Reads the Silicon Expert export (OCTO/seout.xlsx) and fills the
"Unit Cost EUR @1000" column in every component table across every
sheet/tab of every BoM file in the BoM/ folder.
Silicon Expert column mapping:
Manufacturer → UPLOADED MFG
MPN → UPLOADED PART
Price → BUDGETARY PRICES column, parses "Min X & Avg Y" → uses Avg (EUR)
Matching strategy:
1. Exact match on both Uploaded Mfg + Uploaded Part (preferred)
2. Fallback: match on Uploaded Part alone
Where a part appears more than once, the lowest price is used.
Cells that already contain a value are left untouched.
Usage:
python octo_fill.py
"""
from __future__ import annotations
import sys
import logging
from pathlib import Path
from typing import Optional
import re
import openpyxl
from openpyxl.cell.cell import MergedCell
# ── Patch openpyxl for newer Excel attribute it doesn't know about ─────────────
from openpyxl.worksheet.dimensions import SheetFormatProperties as _SFP

# Keep a reference to the unpatched initialiser so the wrapper can delegate.
_sfp_orig = _SFP.__init__


def _sfp_patched(self, **kw):
    # Newer Excel files may carry a 'defaultColWidthPt' sheet-format attribute
    # that this openpyxl version does not recognise; drop it before delegating
    # so load_workbook() doesn't raise a TypeError.
    kw.pop("defaultColWidthPt", None)
    _sfp_orig(self, **kw)


_SFP.__init__ = _sfp_patched
# ──────────────────────────────────────────────────────────────────────────────
# ── Configuration ──────────────────────────────────────────────────────────────
BOM_DIR = Path("BoM")                 # folder holding the BoM workbooks to fill
OCTO_DIR = Path("OCTO")               # folder holding the Silicon Expert export
SEOUT_FILE = OCTO_DIR / "seout.xlsx"  # the Silicon Expert export workbook
COST_HEADER = "Unit Cost EUR @1000"   # header written when a cost column is created

# Placeholder MPN values (lower-cased) that mark a row as having no real part.
SKIP_MPNS = {
    "", "0", "tbd", "n/a", "na", "-", "--", "---", "?", "none",
    "null", "nan", "xxx", "x", "dnf", "dnp", "do not fit",
    "do not populate", "total",
}

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger(__name__)
# ── Load Silicon Expert data ───────────────────────────────────────────────────
def _find_col(headers: dict[str, int], substring: str) -> Optional[int]:
"""Return the index of the first header whose name contains substring (case-insensitive)."""
for name, idx in headers.items():
if substring.lower() in name.lower():
return idx
return None
def load_seout(path: Path) -> tuple[dict[tuple[str, str], float], dict[str, float]]:
    """
    Load seout.xlsx into lookup maps.

    exact_map (mfg_lower, part_lower) → lowest unit price (EUR)
    mpn_map part_lower → lowest unit price (EUR) [fallback]

    A header row is any row containing both an "Uploaded Part" and an
    "Uploaded Mfg" column (case-insensitive); prices come from the
    BUDGETARY PRICES (or any "price") column, parsed from the
    "Min X & Avg Y" text — the Avg figure is used.  When a key appears
    more than once, the lowest price wins.
    """
    exact_map: dict[tuple[str, str], float] = {}
    mpn_map: dict[str, float] = {}
    wb = openpyxl.load_workbook(path, data_only=True, read_only=True)
    avg_col_name = None
    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        headers: Optional[dict[str, int]] = None
        # Column indices, resolved once per sheet when the header row is found
        # (the original resolved them on every data row).
        mpn_col: Optional[int] = None
        mfr_col: Optional[int] = None
        price_col: Optional[int] = None
        for row in ws.iter_rows(values_only=True):
            row = list(row)
            if headers is None:
                row_lower = [str(v).strip().lower() if v is not None else "" for v in row]
                has_part = any("uploaded part" in v for v in row_lower)
                has_mfg = any("uploaded mfg" in v for v in row_lower)
                if has_part and has_mfg:
                    headers = {
                        str(row[i]).strip(): i
                        for i in range(len(row))
                        if row[i] is not None
                    }
                    log.info(f" Sheet '{sheet_name}' headers: {list(headers.keys())}")
                    mpn_col = _find_col(headers, "uploaded part")
                    mfr_col = _find_col(headers, "uploaded mfg")
                    # Silicon Expert stores prices as "Min X & Avg Y" in a
                    # BUDGETARY PRICES column.  NB: explicit None tests here —
                    # the previous `a or b` form discarded a valid match at
                    # column index 0 (falsy).
                    price_col = _find_col(headers, "budgetary")
                    if price_col is None:
                        price_col = _find_col(headers, "price")
                    for h in headers:
                        if "budgetary" in h.lower() or "price" in h.lower():
                            avg_col_name = h
                            break
                continue
            if not any(row):
                continue
            if mpn_col is None or price_col is None:
                continue
            mpn = (
                str(row[mpn_col]).strip()
                if mpn_col < len(row) and row[mpn_col] is not None
                else ""
            )
            mfr = (
                str(row[mfr_col]).strip()
                if mfr_col is not None and mfr_col < len(row) and row[mfr_col] is not None
                else ""
            )
            price_raw = str(row[price_col]).strip() if price_col < len(row) and row[price_col] is not None else ""
            if not mpn or mpn.lower() in SKIP_MPNS:
                continue
            # Parse "Min 0.818 & Avg 1.3225562077" → extract the Avg value
            avg_match = re.search(r'Avg\s+([\d.]+)', price_raw, re.IGNORECASE)
            if not avg_match:
                continue
            try:
                price = float(avg_match.group(1))
            except ValueError:
                continue
            if price <= 0:
                continue
            # Keep the lowest price seen for each key.
            key = (mfr.lower(), mpn.lower())
            if key not in exact_map or price < exact_map[key]:
                exact_map[key] = price
            mpn_k = mpn.lower()
            if mpn_k not in mpn_map or price < mpn_map[mpn_k]:
                mpn_map[mpn_k] = price
    wb.close()
    if not exact_map:
        log.warning(
            f"No entries loaded from {path.name}. "
            "Check that the file has columns containing 'Uploaded Part', 'Uploaded Mfg', "
            "Expected a 'BUDGETARY PRICES' column with values like 'Min X & Avg Y'."
        )
    log.info(
        f"Silicon Expert ({path.name}): {len(exact_map)} unique (mfg, part) entries "
        f"— avg price column: '{avg_col_name}'"
    )
    return exact_map, mpn_map
# ── BoM table finding ──────────────────────────────────────────────────────────
def _cell(value) -> str:
return str(value).strip() if value is not None else ""
def _find_tables(indexed_rows: list[tuple[int, tuple]]):
    """
    Yields TableInfo dicts per component table found.
    Handles multiple tables side-by-side on the same row.
    Includes 'start_col' so the cost-column search stays within each table.

    Each yielded dict contains:
        header_row  1-based Excel row of the "Manufacturer"/"MPN" header
        mfr_col     1-based column of the Manufacturer header
        mpn_col     1-based column of the MPN header
        start_col   1-based leftmost of the two columns (table anchor)
        data        list of (excel_row, manufacturer, mpn) tuples
    """
    i = 0
    while i < len(indexed_rows):
        row_num, row = indexed_rows[i]
        row_str = [_cell(v) for v in row]
        # A header row is any row containing the literal cell values
        # "Manufacturer" and "MPN" (case-insensitive exact match).
        mfr_cols = [c for c, v in enumerate(row_str) if v.lower() == "manufacturer"]
        mpn_cols = [c for c, v in enumerate(row_str) if v.lower() == "mpn"]
        if not mfr_cols or not mpn_cols:
            i += 1
            continue
        # Pair each Manufacturer column with the nearest still-unused MPN
        # column — supports several tables laid out side-by-side on one row.
        pairs: list[tuple[int, int]] = []
        used_mpn: set[int] = set()
        for mfr_col in mfr_cols:
            available = [c for c in mpn_cols if c not in used_mpn]
            if not available:
                break
            best_mpn = min(available, key=lambda c: abs(c - mfr_col))
            pairs.append((mfr_col, best_mpn))
            used_mpn.add(best_mpn)
        max_j = i + 1
        new_header_j = None  # earliest row where a same-column header reappeared
        for mfr_col, mpn_col in pairs:
            data: list[tuple[int, str, str]] = []
            j = i + 1
            empty_streak = 0
            while j < len(indexed_rows):
                dr_num, dr = indexed_rows[j]
                mfr = _cell(dr[mfr_col] if mfr_col < len(dr) else None)
                mpn = _cell(dr[mpn_col] if mpn_col < len(dr) else None)
                if not mfr and not mpn:
                    # Three consecutive blank rows in this column pair
                    # terminate the table.
                    empty_streak += 1
                    if empty_streak >= 3:
                        break
                    j += 1
                    continue
                empty_streak = 0
                # Same-column header detected — record it but let other pairs
                # continue reading past it so their data isn't truncated.
                if mfr.lower() == "manufacturer" and mpn.lower() == "mpn":
                    if new_header_j is None or j < new_header_j:
                        new_header_j = j
                    break
                # Only keep rows with a real MPN (placeholders are skipped).
                if mpn and mpn.lower() not in SKIP_MPNS:
                    data.append((dr_num, mfr, mpn))
                j += 1
            max_j = max(max_j, j)
            yield {
                "header_row": row_num,
                "mfr_col": mfr_col + 1,
                "mpn_col": mpn_col + 1,
                "start_col": min(mfr_col, mpn_col) + 1,
                "data": data,
            }
        # Rewind to the earliest sub-table header so the outer loop can pick it
        # up, while still allowing wider tables (other columns) to have yielded
        # their full data above.
        i = new_header_j if new_header_j is not None else max_j
# ── Write back to BoM files ────────────────────────────────────────────────────
def _resolve_cost_col(ws, header_row: int, start_col: int, known_headers: set[str]) -> int:
    """Return the 1-based cost column for the table headed at *header_row*.

    Scans the header row rightwards from *start_col* for an existing cost
    header (any name in *known_headers*, compared lower-cased).  If none is
    found, writes COST_HEADER one column after the last non-formula header
    cell — stepping past merged cells — and returns that column.
    """
    cost_col = None
    last_used = start_col
    max_col = ws.max_column or 1
    for c in range(start_col, max_col + 1):
        val = ws.cell(header_row, c).value
        if val is not None:
            val_str = str(val).strip()
            # Formula cells (e.g. '=UPPER(...)') don't count as real headers.
            if not val_str.startswith("="):
                last_used = c
                if val_str.lower() in known_headers:
                    cost_col = c
                    break
    if cost_col is None:
        cost_col = last_used + 1
        # Never write into a merged region — step right until a normal cell.
        while isinstance(ws.cell(header_row, cost_col), MergedCell):
            cost_col += 1
        ws.cell(header_row, cost_col).value = COST_HEADER
    return cost_col


def fill_boms(
    bom_dir: Path,
    exact_map: dict[tuple[str, str], float],
    mpn_map: dict[str, float],
) -> None:
    """Fill the cost column of every component table in every BoM workbook.

    For each table found by _find_tables(), an existing cost column is reused
    (or created) and each empty/zero/formula cell is filled with the lowest
    known price — exact (mfg, part) match first, part-only fallback second.
    Cells that already hold a real value are left untouched.  Workbooks are
    saved in place; exits the process if bom_dir contains no workbooks.
    """
    files = sorted(f for f in bom_dir.iterdir() if f.suffix.lower() in {".xlsx", ".xlsm"})
    if not files:
        log.error(f"No .xlsx/.xlsm files found in {bom_dir}/")
        sys.exit(1)
    # Header names (lower-cased) recognised as an existing cost column.
    # Built once here instead of once per sheet.
    known_cost_headers = {
        COST_HEADER.lower(),
        "unit cost 1000x data",
    }
    total_filled = 0
    total_skipped = 0
    total_missing = 0
    for f in files:
        log.info(f"Processing {f.name}")
        try:
            # data_only resolves formula cells (e.g. =UPPER("Mfr")) to their values
            # for detection; the writable wb is used for writing prices.
            wb_ro = openpyxl.load_workbook(f, data_only=True, read_only=True)
            wb = openpyxl.load_workbook(f)
        except Exception as exc:
            log.error(f" Cannot open {f.name}: {exc}")
            continue
        for sheet_name in wb.sheetnames:
            ws = wb[sheet_name]
            indexed = [
                (i, tuple(row))
                for i, row in enumerate(wb_ro[sheet_name].iter_rows(values_only=True), start=1)
            ]
            # Reuse the same cost column for all stacked tables at the same
            # start_col on this sheet, so a second sub-table doesn't create a
            # new column one position to the right.
            sheet_cost_cols: dict[int, int] = {}
            for table in _find_tables(indexed):
                header_row = table["header_row"]
                data_rows = [r for r, _, _ in table["data"]]
                row_range = (
                    f" (Excel rows {data_rows[0]}-{data_rows[-1]})"
                    if data_rows else " (no data rows detected)"
                )
                log.info(
                    f" Sheet '{sheet_name}' row {header_row}: "
                    f"table at col {table['start_col']}, {len(table['data'])} parts{row_range}"
                )
                if table["start_col"] in sheet_cost_cols:
                    # Stacked table — reuse the cost column found/created by the
                    # first table at this column position on this sheet.
                    cost_col = sheet_cost_cols[table["start_col"]]
                else:
                    cost_col = _resolve_cost_col(ws, header_row, table["start_col"], known_cost_headers)
                    sheet_cost_cols[table["start_col"]] = cost_col
                log.info(f" Cost column: {cost_col} ('{ws.cell(header_row, cost_col).value}')")
                tbl_filled = tbl_skipped = tbl_missing = 0
                for row_num, mfr, mpn in table["data"]:
                    cell = ws.cell(row_num, cost_col)
                    if isinstance(cell, MergedCell):
                        continue
                    existing = cell.value
                    is_formula = isinstance(existing, str) and existing.startswith("=")
                    # None, blank strings and literal zeros count as fillable;
                    # formulas are overwritten with the resolved price too.
                    is_empty = (
                        existing is None
                        or (isinstance(existing, str) and existing.strip() in ("", "0"))
                        or (isinstance(existing, (int, float)) and existing == 0)
                    )
                    if not is_empty and not is_formula:
                        log.debug(f" Skip row {row_num} [{mpn}]: cell already has {repr(existing)}")
                        total_skipped += 1
                        tbl_skipped += 1
                        continue
                    # Exact (manufacturer, part) match first, MPN-only fallback.
                    price = exact_map.get((mfr.lower(), mpn.lower()))
                    if price is None:
                        price = mpn_map.get(mpn.lower())
                        if price is not None:
                            log.debug(f" MPN-only match: {mpn} (mfr '{mfr}' not matched)")
                    if price is not None:
                        cell.value = price
                        cell.number_format = "0.000000"
                        total_filled += 1
                        tbl_filled += 1
                    else:
                        total_missing += 1
                        tbl_missing += 1
                        log.debug(f" No match: [{mfr}] [{mpn}]")
                log.info(
                    f" → filled {tbl_filled}, skipped {tbl_skipped}, "
                    f"no match {tbl_missing}"
                )
        wb_ro.close()
        try:
            wb.save(f)
            log.info(f" Saved {f.name}")
        except PermissionError:
            log.error(f" Cannot save {f.name} - close it in Excel first.")
        except Exception as exc:
            log.error(f" Save failed for {f.name}: {exc}")
    log.info(
        f"Done - filled: {total_filled}, "
        f"already populated (skipped): {total_skipped}, "
        f"no match in Silicon Expert: {total_missing}"
    )
# ── Main ───────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Fail fast when the expected folder layout is absent.
    for required in (BOM_DIR, OCTO_DIR):
        if not required.exists():
            log.error(f"Not found: {required}")
            sys.exit(1)
    if not SEOUT_FILE.exists():
        log.error(f"Silicon Expert export not found: {SEOUT_FILE}")
        sys.exit(1)
    # Build the price lookups once, then apply them to every BoM workbook.
    exact_map, mpn_map = load_seout(SEOUT_FILE)
    fill_boms(BOM_DIR, exact_map, mpn_map)