171 lines
5.3 KiB
Python
171 lines
5.3 KiB
Python
#!/usr/bin/env python3
"""
BoM Extractor
=============

Reads every .xlsx / .xlsm file from the BoM/ folder, extracts all
(Manufacturer, MPN) pairs from every table in every sheet, deduplicates,
and writes the result to bom_parts.xlsx.

Usage:
    python bom_extract.py
"""

from __future__ import annotations

import sys
import logging
from pathlib import Path

import openpyxl
import pandas as pd

# Input folder scanned for workbooks; output folder for the result files.
BOM_DIR = Path("BoM")
OUTPUT_DIR = Path("OUTPUT")
# Maximum number of part rows written per output workbook (see write_chunks).
CHUNK_SIZE = 500

# MPN placeholder values (compared case-insensitively, after stripping) that
# mark a row as "no real part" — such rows are never extracted.
SKIP_MPNS = {
    "", "tbd", "n/a", "na", "-", "--", "---", "?", "none",
    "null", "nan", "xxx", "x", "dnf", "dnp", "do not fit",
    "do not populate",
}

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger(__name__)
def _cell(value) -> str:
|
||
return str(value).strip() if value is not None else ""
|
||
|
||
|
||
def _find_tables(indexed_rows: list[tuple[int, tuple]]) -> list[tuple[str, str]]:
    """
    Return all (manufacturer, mpn) pairs found across every table in the row list.

    Handles multiple tables side-by-side on the same header row.

    A "header row" is any row containing at least one cell equal (case-
    insensitively, after stripping) to "manufacturer" AND one equal to "mpn".
    Each Manufacturer column is paired with the nearest unclaimed MPN column,
    then data rows below the header are read until the table ends.

    Parameters
    ----------
    indexed_rows : list of (row_number, row_values) tuples for one sheet.
        The row_number element is currently unused by this function.

    Returns
    -------
    All (manufacturer, mpn) pairs, in sheet order, excluding placeholder MPNs
    listed in SKIP_MPNS. No deduplication is done here (see extract()).
    """
    parts: list[tuple[str, str]] = []
    i = 0
    while i < len(indexed_rows):
        _, row = indexed_rows[i]
        row_str = [_cell(v) for v in row]

        # Column indices of every "Manufacturer" / "MPN" header cell on this
        # row — there may be several of each when tables sit side-by-side.
        mfr_cols = [c for c, v in enumerate(row_str) if v.lower() == "manufacturer"]
        mpn_cols = [c for c, v in enumerate(row_str) if v.lower() == "mpn"]

        if not mfr_cols or not mpn_cols:
            i += 1
            continue

        # Pair each mfr_col with its nearest unpaired mpn_col
        pairs: list[tuple[int, int]] = []
        used_mpn: set[int] = set()
        for mfr_col in mfr_cols:
            available = [c for c in mpn_cols if c not in used_mpn]
            if not available:
                # More Manufacturer headers than MPN headers — extras ignored.
                break
            best_mpn = min(available, key=lambda c: abs(c - mfr_col))
            pairs.append((mfr_col, best_mpn))
            used_mpn.add(best_mpn)

        # Walk the data rows below the header once per column pair.
        # max_j tracks how far the longest table extends so the outer loop
        # can resume scanning after it.
        max_j = i + 1
        for mfr_col, mpn_col in pairs:
            j = i + 1
            empty_streak = 0
            while j < len(indexed_rows):
                _, dr = indexed_rows[j]
                # Rows may be shorter than the header row in openpyxl output,
                # so guard the column index before reading.
                mfr = _cell(dr[mfr_col] if mfr_col < len(dr) else None)
                mpn = _cell(dr[mpn_col] if mpn_col < len(dr) else None)

                if not mfr and not mpn:
                    # Tolerate up to two blank rows inside a table; three
                    # consecutive blanks terminate it.
                    empty_streak += 1
                    if empty_streak >= 3:
                        break
                    j += 1
                    continue
                empty_streak = 0

                # A fresh header row in these columns starts a new table —
                # stop here; the outer loop will re-detect it at i = max_j.
                if mfr.lower() == "manufacturer" and mpn.lower() == "mpn":
                    break

                # Keep rows with a real MPN; manufacturer may be blank.
                if mpn and mpn.lower() not in SKIP_MPNS:
                    parts.append((mfr, mpn))
                j += 1

            max_j = max(max_j, j)

        i = max_j
    return parts
def extract(bom_dir: Path) -> list[tuple[str, str]]:
    """
    Collect unique (Manufacturer, MPN) pairs from every workbook in *bom_dir*.

    Deduplication is case-insensitive on both fields; the first-seen spelling
    of a pair is kept. A workbook that fails to load is logged and skipped.

    Parameters
    ----------
    bom_dir : folder containing the .xlsx / .xlsm input files.

    Returns
    -------
    Unique (manufacturer, mpn) pairs in first-seen order.

    Exits the process with status 1 if the folder contains no workbooks.
    """
    files = sorted(
        f for f in bom_dir.iterdir()
        if f.suffix.lower() in {".xlsx", ".xlsm"}
        # Excel leaves "~$<name>.xlsx" lock files behind while a workbook is
        # open; they match the suffix filter but are not loadable workbooks.
        and not f.name.startswith("~$")
    )

    if not files:
        log.error(f"No .xlsx/.xlsm files found in {bom_dir}/")
        sys.exit(1)

    seen: set[tuple[str, str]] = set()
    parts: list[tuple[str, str]] = []

    for f in files:
        log.info(f"Reading {f.name}")
        try:
            wb = openpyxl.load_workbook(f, data_only=True, read_only=True)
            try:
                for sheet_name in wb.sheetnames:
                    ws = wb[sheet_name]
                    indexed = [
                        (i, tuple(row))
                        for i, row in enumerate(ws.iter_rows(values_only=True), start=1)
                    ]
                    found = _find_tables(indexed)
                    new = [(mfr, mpn) for mfr, mpn in found
                           if (mfr.lower(), mpn.lower()) not in seen]
                    for mfr, mpn in new:
                        seen.add((mfr.lower(), mpn.lower()))
                        parts.append((mfr, mpn))
                    if found:
                        log.info(f"  Sheet '{sheet_name}': {len(found)} rows, {len(new)} new unique")
            finally:
                # read_only workbooks keep the file handle open until closed,
                # so close even when a sheet raises mid-iteration.
                wb.close()
        except Exception as exc:
            log.error(f"  Failed to read {f.name}: {exc}")

    log.info(f"Total unique parts: {len(parts)}")
    return parts
def write_chunks(parts: list[tuple[str, str]], output_dir: Path) -> None:
    """
    Write *parts* to .xlsx files of at most CHUNK_SIZE rows each in *output_dir*.

    Files are named bom_parts_<k>_of_<n>.xlsx; rows are sorted by Manufacturer
    then MPN and columns are auto-sized (capped at 60 characters). With an
    empty parts list no file is written.

    Parameters
    ----------
    parts : (manufacturer, mpn) pairs, typically from extract().
    output_dir : destination folder, created if missing.
    """
    # parents=True so a nested or relocated output path doesn't crash
    # on first run (bare exist_ok only handles the last path component).
    output_dir.mkdir(parents=True, exist_ok=True)

    df = pd.DataFrame(parts, columns=["Manufacturer", "MPN"])
    df.sort_values(["Manufacturer", "MPN"], inplace=True, ignore_index=True)

    total = len(df)
    n_files = (total + CHUNK_SIZE - 1) // CHUNK_SIZE  # ceiling division

    for idx in range(n_files):
        chunk = df.iloc[idx * CHUNK_SIZE : (idx + 1) * CHUNK_SIZE]
        out = output_dir / f"bom_parts_{idx + 1}_of_{n_files}.xlsx"

        with pd.ExcelWriter(out, engine="openpyxl") as writer:
            chunk.to_excel(writer, index=False, sheet_name="Parts")
            ws = writer.sheets["Parts"]
            # Auto-fit each column to its widest cell (header included),
            # with a little padding, capped at 60 characters.
            for col in ws.columns:
                width = max(len(str(cell.value or "")) for cell in col)
                ws.column_dimensions[col[0].column_letter].width = min(width + 3, 60)

        log.info(f"  Written → {out} ({len(chunk)} parts)")

    log.info(f"Done – {total} unique parts across {n_files} file(s) in {output_dir}/")
if __name__ == "__main__":
    # Fail fast with a clear message if the input folder is missing.
    if not BOM_DIR.exists():
        log.error(f"BoM directory '{BOM_DIR}' not found.")
        sys.exit(1)

    extracted = extract(BOM_DIR)
    write_chunks(extracted, OUTPUT_DIR)