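Updates (bom_extract.py: detect side-by-side tables on one header row; write output as 500-row chunk files under OUTPUT/)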

2026-04-30 12:39:48 +01:00
parent bc2791e6fa
commit 70b2b6acc3
6 changed files with 431 additions and 67 deletions
--- a/bom_extract.py
+++ b/bom_extract.py
@@ -20,7 +20,8 @@ import openpyxl
 import pandas as pd

 BOM_DIR     = Path("BoM")
-OUTPUT_FILE = Path("bom_parts.xlsx")
+OUTPUT_DIR  = Path("OUTPUT")
+CHUNK_SIZE  = 500

 SKIP_MPNS = {
    "", "tbd", "n/a", "na", "-", "--", "---", "?", "none",
@@ -41,43 +42,61 @@ def _cell(value) -> str:


 def _find_tables(indexed_rows: list[tuple[int, tuple]]) -> list[tuple[str, str]]:
-    """Return all (manufacturer, mpn) pairs found across every table in the row list."""
+    """
+    Return all (manufacturer, mpn) pairs found across every table in the row list.
+    Handles multiple tables side-by-side on the same header row.
+    """
    parts: list[tuple[str, str]] = []
    i = 0
    while i < len(indexed_rows):
        _, row = indexed_rows[i]
        row_str = [_cell(v) for v in row]

-        mfr_col = next((c for c, v in enumerate(row_str) if v.lower() == "manufacturer"), None)
-        mpn_col = next((c for c, v in enumerate(row_str) if v.lower() == "mpn"), None)
+        mfr_cols = [c for c, v in enumerate(row_str) if v.lower() == "manufacturer"]
+        mpn_cols = [c for c, v in enumerate(row_str) if v.lower() == "mpn"]

-        if mfr_col is None or mpn_col is None:
+        if not mfr_cols or not mpn_cols:
            i += 1
            continue

-        j = i + 1
-        empty_streak = 0
-        while j < len(indexed_rows):
-            _, dr = indexed_rows[j]
-            mfr = _cell(dr[mfr_col] if mfr_col < len(dr) else None)
-            mpn = _cell(dr[mpn_col] if mpn_col < len(dr) else None)
-
-            if not mfr and not mpn:
-                empty_streak += 1
-                if empty_streak >= 3:
-                    break
-                j += 1
-                continue
-            empty_streak = 0
-
-            if mfr.lower() == "manufacturer" and mpn.lower() == "mpn":
+        # Pair each mfr_col with its nearest unpaired mpn_col
+        pairs: list[tuple[int, int]] = []
+        used_mpn: set[int] = set()
+        for mfr_col in mfr_cols:
+            available = [c for c in mpn_cols if c not in used_mpn]
+            if not available:
                break
+            best_mpn = min(available, key=lambda c: abs(c - mfr_col))
+            pairs.append((mfr_col, best_mpn))
+            used_mpn.add(best_mpn)

-            if mpn and mpn.lower() not in SKIP_MPNS:
-                parts.append((mfr, mpn))
-            j += 1
+        max_j = i + 1
+        for mfr_col, mpn_col in pairs:
+            j = i + 1
+            empty_streak = 0
+            while j < len(indexed_rows):
+                _, dr = indexed_rows[j]
+                mfr = _cell(dr[mfr_col] if mfr_col < len(dr) else None)
+                mpn = _cell(dr[mpn_col] if mpn_col < len(dr) else None)

-        i = j
+                if not mfr and not mpn:
+                    empty_streak += 1
+                    if empty_streak >= 3:
+                        break
+                    j += 1
+                    continue
+                empty_streak = 0
+
+                if mfr.lower() == "manufacturer" and mpn.lower() == "mpn":
+                    break
+
+                if mpn and mpn.lower() not in SKIP_MPNS:
+                    parts.append((mfr, mpn))
+                j += 1
+
+            max_j = max(max_j, j)
+
+        i = max_j
    return parts


@@ -117,18 +136,29 @@ def extract(bom_dir: Path) -> list[tuple[str, str]]:
    return parts


-def write(parts: list[tuple[str, str]], output: Path) -> None:
+def write_chunks(parts: list[tuple[str, str]], output_dir: Path) -> None:
+    output_dir.mkdir(exist_ok=True)
+
    df = pd.DataFrame(parts, columns=["Manufacturer", "MPN"])
    df.sort_values(["Manufacturer", "MPN"], inplace=True, ignore_index=True)

-    with pd.ExcelWriter(output, engine="openpyxl") as writer:
-        df.to_excel(writer, index=False, sheet_name="Parts")
-        ws = writer.sheets["Parts"]
-        for col in ws.columns:
-            width = max(len(str(cell.value or "")) for cell in col)
-            ws.column_dimensions[col[0].column_letter].width = min(width + 3, 60)
+    total   = len(df)
+    n_files = (total + CHUNK_SIZE - 1) // CHUNK_SIZE

-    log.info(f"Written → {output}  ({len(parts)} unique parts)")
+    for idx in range(n_files):
+        chunk = df.iloc[idx * CHUNK_SIZE : (idx + 1) * CHUNK_SIZE]
+        out   = output_dir / f"bom_parts_{idx + 1}_of_{n_files}.xlsx"
+
+        with pd.ExcelWriter(out, engine="openpyxl") as writer:
+            chunk.to_excel(writer, index=False, sheet_name="Parts")
+            ws = writer.sheets["Parts"]
+            for col in ws.columns:
+                width = max(len(str(cell.value or "")) for cell in col)
+                ws.column_dimensions[col[0].column_letter].width = min(width + 3, 60)
+
+        log.info(f"  Written → {out}  ({len(chunk)} parts)")
+
+    log.info(f"Done – {total} unique parts across {n_files} file(s) in {output_dir}/")


 if __name__ == "__main__":
@@ -137,4 +167,4 @@ if __name__ == "__main__":
        sys.exit(1)

    parts = extract(BOM_DIR)
-    write(parts, OUTPUT_FILE)
+    write_chunks(parts, OUTPUT_DIR)