#!/usr/bin/env python3
"""
BoM Extractor
=============
Reads every .xlsx / .xlsm file from the BoM/ folder, extracts all
(Manufacturer, MPN) pairs from every table in every sheet, deduplicates,
and writes the result to bom_parts.xlsx.

Usage:
    python bom_extract.py
"""

# Reconstructed from a whitespace-collapsed git patch. Patch metadata kept
# here for reference:
#   diff --git a/bom_extract.py b/bom_extract.py
#       new file mode 100644, index 0000000..b0a7882
#   diff --git a/bom_parts.xlsx b/bom_parts.xlsx
#       new file mode 100644, index 0000000..1002e1c (binary)

from __future__ import annotations

import logging
import sys
from pathlib import Path

import pandas as pd

BOM_DIR = Path("BoM")
OUTPUT_FILE = Path("bom_parts.xlsx")

# Workbook extensions that are scanned (compared lower-cased).
_BOM_SUFFIXES = {".xlsx", ".xlsm"}

# Lower-cased MPN cell values that are placeholders, not real part numbers.
SKIP_MPNS = {
    "", "tbd", "n/a", "na", "-", "--", "---", "?", "none",
    "null", "nan", "xxx", "x", "dnf", "dnp", "do not fit",
    "do not populate",
}

# A table ends after this many consecutive rows whose Manufacturer and MPN
# cells are both blank.
_MAX_EMPTY_STREAK = 3

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger(__name__)


def _cell(value: object) -> str:
    """Return *value* as a stripped string; ``None`` becomes ``""``."""
    return "" if value is None else str(value).strip()


def _header_columns(row: tuple) -> tuple[int, int] | None:
    """Return ``(manufacturer_col, mpn_col)`` if *row* is a table header.

    A header row contains both a cell reading "manufacturer" and a cell
    reading "mpn" (case-insensitive, surrounding whitespace ignored).  The
    first occurrence of each label wins.  Returns ``None`` otherwise.
    """
    mfr_col = mpn_col = None
    for col, value in enumerate(row):
        label = _cell(value).lower()
        if label == "manufacturer" and mfr_col is None:
            mfr_col = col
        elif label == "mpn" and mpn_col is None:
            mpn_col = col
    if mfr_col is None or mpn_col is None:
        return None
    return mfr_col, mpn_col


def _find_tables(indexed_rows: list[tuple[int, tuple]]) -> list[tuple[str, str]]:
    """Return all (manufacturer, mpn) pairs found across every table in the row list.

    Args:
        indexed_rows: ``(row_number, cell_values)`` pairs in sheet order.  The
            row number is not used by the parser.

    Returns:
        The pairs in encounter order, duplicates included.  Rows whose MPN is
        blank or listed in ``SKIP_MPNS`` are dropped.

    A table starts at a header row (see ``_header_columns``) and ends at the
    next header row or after ``_MAX_EMPTY_STREAK`` consecutive rows whose
    Manufacturer and MPN cells are both blank.
    """
    parts: list[tuple[str, str]] = []
    columns: tuple[int, int] | None = None  # None while looking for a header
    empty_streak = 0

    for _, row in indexed_rows:
        # Check for a header on every row, not only at the previous table's
        # column positions: a following table may use a different layout and
        # must not have its header (or data) read through the old columns.
        header = _header_columns(row)
        if header is not None:
            columns = header
            empty_streak = 0
            continue
        if columns is None:
            continue

        mfr_col, mpn_col = columns
        # Rows may be shorter than the header row; treat missing cells as blank.
        mfr = _cell(row[mfr_col]) if mfr_col < len(row) else ""
        mpn = _cell(row[mpn_col]) if mpn_col < len(row) else ""

        if not mfr and not mpn:
            empty_streak += 1
            if empty_streak >= _MAX_EMPTY_STREAK:
                columns = None  # table finished; wait for the next header
            continue
        empty_streak = 0

        if mpn and mpn.lower() not in SKIP_MPNS:
            parts.append((mfr, mpn))

    return parts


def _collect_new(
    found: list[tuple[str, str]], seen: set[tuple[str, str]]
) -> list[tuple[str, str]]:
    """Return the pairs in *found* not yet in *seen*, recording them in *seen*.

    Comparison is case-insensitive on both fields.  *seen* is updated while
    iterating so that a pair repeated inside *found* is reported only once.
    """
    new: list[tuple[str, str]] = []
    for mfr, mpn in found:
        key = (mfr.lower(), mpn.lower())
        if key not in seen:
            seen.add(key)
            new.append((mfr, mpn))
    return new


def extract(bom_dir: Path) -> list[tuple[str, str]]:
    """Collect the unique (manufacturer, mpn) pairs from every workbook in *bom_dir*.

    Args:
        bom_dir: Directory scanned (non-recursively) for .xlsx / .xlsm files.

    Returns:
        Unique pairs in first-seen order; uniqueness is case-insensitive.

    Exits the process with status 1 when no workbook is found.  A workbook
    that cannot be read is logged and skipped; pairs already collected from
    it are kept.
    """
    # Imported here so the pure parsing helpers stay importable without it.
    import openpyxl

    files = sorted(
        f
        for f in bom_dir.iterdir()
        if f.is_file()
        and f.suffix.lower() in _BOM_SUFFIXES
        # "~$name.xlsx" is the lock file Excel creates next to an open workbook.
        and not f.name.startswith("~$")
    )

    if not files:
        log.error("No .xlsx/.xlsm files found in %s/", bom_dir)
        sys.exit(1)

    seen: set[tuple[str, str]] = set()
    parts: list[tuple[str, str]] = []

    for f in files:
        log.info("Reading %s", f.name)
        try:
            wb = openpyxl.load_workbook(f, data_only=True, read_only=True)
        except Exception as exc:  # per-file boundary: log and move on
            log.error(" Failed to read %s: %s", f.name, exc)
            continue

        try:
            # wb.worksheets leaves out chartsheets, which have no rows to read.
            for ws in wb.worksheets:
                indexed = [
                    (i, tuple(row))
                    for i, row in enumerate(ws.iter_rows(values_only=True), start=1)
                ]
                found = _find_tables(indexed)
                new = _collect_new(found, seen)
                parts.extend(new)
                if found:
                    log.info(
                        " Sheet '%s': %d rows, %d new unique",
                        ws.title, len(found), len(new),
                    )
        except Exception as exc:  # per-file boundary: log and move on
            log.error(" Failed to read %s: %s", f.name, exc)
        finally:
            # Read-only workbooks keep the file open until closed explicitly.
            wb.close()

    log.info("Total unique parts: %d", len(parts))
    return parts


def write(parts: list[tuple[str, str]], output: Path) -> None:
    """Write *parts* to *output* as a sorted two-column Excel sheet named "Parts".

    Args:
        parts: (manufacturer, mpn) pairs to write.
        output: Destination .xlsx path; overwritten if it exists.
    """
    df = pd.DataFrame(parts, columns=["Manufacturer", "MPN"])
    df = df.sort_values(["Manufacturer", "MPN"], ignore_index=True)

    with pd.ExcelWriter(output, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="Parts")
        ws = writer.sheets["Parts"]
        # Size each column to its longest cell plus padding, capped at 60.
        for col in ws.columns:
            width = max(len(str(cell.value or "")) for cell in col)
            ws.column_dimensions[col[0].column_letter].width = min(width + 3, 60)

    log.info("Written → %s (%d unique parts)", output, len(parts))


def main() -> None:
    """Extract parts from ``BOM_DIR`` and write them to ``OUTPUT_FILE``."""
    # is_dir() rather than exists(): a plain file named "BoM" must not pass.
    if not BOM_DIR.is_dir():
        log.error("BoM directory '%s' not found.", BOM_DIR)
        sys.exit(1)

    write(extract(BOM_DIR), OUTPUT_FILE)


if __name__ == "__main__":
    main()