This commit is contained in:
David Rice
2026-04-30 12:39:48 +01:00
parent bc2791e6fa
commit 70b2b6acc3
6 changed files with 431 additions and 67 deletions

View File

@@ -20,7 +20,8 @@ import openpyxl
import pandas as pd
BOM_DIR = Path("BoM")
OUTPUT_FILE = Path("bom_parts.xlsx")
OUTPUT_DIR = Path("OUTPUT")
CHUNK_SIZE = 500
SKIP_MPNS = {
"", "tbd", "n/a", "na", "-", "--", "---", "?", "none",
@@ -41,43 +42,61 @@ def _cell(value) -> str:
def _find_tables(indexed_rows: list[tuple[int, tuple]]) -> list[tuple[str, str]]:
"""Return all (manufacturer, mpn) pairs found across every table in the row list."""
"""
Return all (manufacturer, mpn) pairs found across every table in the row list.
Handles multiple tables side-by-side on the same header row.
"""
parts: list[tuple[str, str]] = []
i = 0
while i < len(indexed_rows):
_, row = indexed_rows[i]
row_str = [_cell(v) for v in row]
mfr_col = next((c for c, v in enumerate(row_str) if v.lower() == "manufacturer"), None)
mpn_col = next((c for c, v in enumerate(row_str) if v.lower() == "mpn"), None)
mfr_cols = [c for c, v in enumerate(row_str) if v.lower() == "manufacturer"]
mpn_cols = [c for c, v in enumerate(row_str) if v.lower() == "mpn"]
if mfr_col is None or mpn_col is None:
if not mfr_cols or not mpn_cols:
i += 1
continue
j = i + 1
empty_streak = 0
while j < len(indexed_rows):
_, dr = indexed_rows[j]
mfr = _cell(dr[mfr_col] if mfr_col < len(dr) else None)
mpn = _cell(dr[mpn_col] if mpn_col < len(dr) else None)
if not mfr and not mpn:
empty_streak += 1
if empty_streak >= 3:
break
j += 1
continue
empty_streak = 0
if mfr.lower() == "manufacturer" and mpn.lower() == "mpn":
# Pair each mfr_col with its nearest unpaired mpn_col
pairs: list[tuple[int, int]] = []
used_mpn: set[int] = set()
for mfr_col in mfr_cols:
available = [c for c in mpn_cols if c not in used_mpn]
if not available:
break
best_mpn = min(available, key=lambda c: abs(c - mfr_col))
pairs.append((mfr_col, best_mpn))
used_mpn.add(best_mpn)
if mpn and mpn.lower() not in SKIP_MPNS:
parts.append((mfr, mpn))
j += 1
max_j = i + 1
for mfr_col, mpn_col in pairs:
j = i + 1
empty_streak = 0
while j < len(indexed_rows):
_, dr = indexed_rows[j]
mfr = _cell(dr[mfr_col] if mfr_col < len(dr) else None)
mpn = _cell(dr[mpn_col] if mpn_col < len(dr) else None)
i = j
if not mfr and not mpn:
empty_streak += 1
if empty_streak >= 3:
break
j += 1
continue
empty_streak = 0
if mfr.lower() == "manufacturer" and mpn.lower() == "mpn":
break
if mpn and mpn.lower() not in SKIP_MPNS:
parts.append((mfr, mpn))
j += 1
max_j = max(max_j, j)
i = max_j
return parts
@@ -117,18 +136,29 @@ def extract(bom_dir: Path) -> list[tuple[str, str]]:
return parts
def write(parts: list[tuple[str, str]], output: Path) -> None:
def write_chunks(parts: list[tuple[str, str]], output_dir: Path) -> None:
output_dir.mkdir(exist_ok=True)
df = pd.DataFrame(parts, columns=["Manufacturer", "MPN"])
df.sort_values(["Manufacturer", "MPN"], inplace=True, ignore_index=True)
with pd.ExcelWriter(output, engine="openpyxl") as writer:
df.to_excel(writer, index=False, sheet_name="Parts")
ws = writer.sheets["Parts"]
for col in ws.columns:
width = max(len(str(cell.value or "")) for cell in col)
ws.column_dimensions[col[0].column_letter].width = min(width + 3, 60)
total = len(df)
n_files = (total + CHUNK_SIZE - 1) // CHUNK_SIZE
log.info(f"Written → {output} ({len(parts)} unique parts)")
for idx in range(n_files):
chunk = df.iloc[idx * CHUNK_SIZE : (idx + 1) * CHUNK_SIZE]
out = output_dir / f"bom_parts_{idx + 1}_of_{n_files}.xlsx"
with pd.ExcelWriter(out, engine="openpyxl") as writer:
chunk.to_excel(writer, index=False, sheet_name="Parts")
ws = writer.sheets["Parts"]
for col in ws.columns:
width = max(len(str(cell.value or "")) for cell in col)
ws.column_dimensions[col[0].column_letter].width = min(width + 3, 60)
log.info(f" Written → {out} ({len(chunk)} parts)")
log.info(f"Done {total} unique parts across {n_files} file(s) in {output_dir}/")
if __name__ == "__main__":
@@ -137,4 +167,4 @@ if __name__ == "__main__":
sys.exit(1)
parts = extract(BOM_DIR)
write(parts, OUTPUT_FILE)
write_chunks(parts, OUTPUT_DIR)