Updates
This commit is contained in:
@@ -20,7 +20,8 @@ import openpyxl
|
||||
import pandas as pd
|
||||
|
||||
BOM_DIR = Path("BoM")
|
||||
OUTPUT_FILE = Path("bom_parts.xlsx")
|
||||
OUTPUT_DIR = Path("OUTPUT")
|
||||
CHUNK_SIZE = 500
|
||||
|
||||
SKIP_MPNS = {
|
||||
"", "tbd", "n/a", "na", "-", "--", "---", "?", "none",
|
||||
@@ -41,43 +42,61 @@ def _cell(value) -> str:
|
||||
|
||||
|
||||
def _find_tables(indexed_rows: list[tuple[int, tuple]]) -> list[tuple[str, str]]:
|
||||
"""Return all (manufacturer, mpn) pairs found across every table in the row list."""
|
||||
"""
|
||||
Return all (manufacturer, mpn) pairs found across every table in the row list.
|
||||
Handles multiple tables side-by-side on the same header row.
|
||||
"""
|
||||
parts: list[tuple[str, str]] = []
|
||||
i = 0
|
||||
while i < len(indexed_rows):
|
||||
_, row = indexed_rows[i]
|
||||
row_str = [_cell(v) for v in row]
|
||||
|
||||
mfr_col = next((c for c, v in enumerate(row_str) if v.lower() == "manufacturer"), None)
|
||||
mpn_col = next((c for c, v in enumerate(row_str) if v.lower() == "mpn"), None)
|
||||
mfr_cols = [c for c, v in enumerate(row_str) if v.lower() == "manufacturer"]
|
||||
mpn_cols = [c for c, v in enumerate(row_str) if v.lower() == "mpn"]
|
||||
|
||||
if mfr_col is None or mpn_col is None:
|
||||
if not mfr_cols or not mpn_cols:
|
||||
i += 1
|
||||
continue
|
||||
|
||||
j = i + 1
|
||||
empty_streak = 0
|
||||
while j < len(indexed_rows):
|
||||
_, dr = indexed_rows[j]
|
||||
mfr = _cell(dr[mfr_col] if mfr_col < len(dr) else None)
|
||||
mpn = _cell(dr[mpn_col] if mpn_col < len(dr) else None)
|
||||
|
||||
if not mfr and not mpn:
|
||||
empty_streak += 1
|
||||
if empty_streak >= 3:
|
||||
break
|
||||
j += 1
|
||||
continue
|
||||
empty_streak = 0
|
||||
|
||||
if mfr.lower() == "manufacturer" and mpn.lower() == "mpn":
|
||||
# Pair each mfr_col with its nearest unpaired mpn_col
|
||||
pairs: list[tuple[int, int]] = []
|
||||
used_mpn: set[int] = set()
|
||||
for mfr_col in mfr_cols:
|
||||
available = [c for c in mpn_cols if c not in used_mpn]
|
||||
if not available:
|
||||
break
|
||||
best_mpn = min(available, key=lambda c: abs(c - mfr_col))
|
||||
pairs.append((mfr_col, best_mpn))
|
||||
used_mpn.add(best_mpn)
|
||||
|
||||
if mpn and mpn.lower() not in SKIP_MPNS:
|
||||
parts.append((mfr, mpn))
|
||||
j += 1
|
||||
max_j = i + 1
|
||||
for mfr_col, mpn_col in pairs:
|
||||
j = i + 1
|
||||
empty_streak = 0
|
||||
while j < len(indexed_rows):
|
||||
_, dr = indexed_rows[j]
|
||||
mfr = _cell(dr[mfr_col] if mfr_col < len(dr) else None)
|
||||
mpn = _cell(dr[mpn_col] if mpn_col < len(dr) else None)
|
||||
|
||||
i = j
|
||||
if not mfr and not mpn:
|
||||
empty_streak += 1
|
||||
if empty_streak >= 3:
|
||||
break
|
||||
j += 1
|
||||
continue
|
||||
empty_streak = 0
|
||||
|
||||
if mfr.lower() == "manufacturer" and mpn.lower() == "mpn":
|
||||
break
|
||||
|
||||
if mpn and mpn.lower() not in SKIP_MPNS:
|
||||
parts.append((mfr, mpn))
|
||||
j += 1
|
||||
|
||||
max_j = max(max_j, j)
|
||||
|
||||
i = max_j
|
||||
return parts
|
||||
|
||||
|
||||
@@ -117,18 +136,29 @@ def extract(bom_dir: Path) -> list[tuple[str, str]]:
|
||||
return parts
|
||||
|
||||
|
||||
def write(parts: list[tuple[str, str]], output: Path) -> None:
|
||||
def write_chunks(parts: list[tuple[str, str]], output_dir: Path) -> None:
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
df = pd.DataFrame(parts, columns=["Manufacturer", "MPN"])
|
||||
df.sort_values(["Manufacturer", "MPN"], inplace=True, ignore_index=True)
|
||||
|
||||
with pd.ExcelWriter(output, engine="openpyxl") as writer:
|
||||
df.to_excel(writer, index=False, sheet_name="Parts")
|
||||
ws = writer.sheets["Parts"]
|
||||
for col in ws.columns:
|
||||
width = max(len(str(cell.value or "")) for cell in col)
|
||||
ws.column_dimensions[col[0].column_letter].width = min(width + 3, 60)
|
||||
total = len(df)
|
||||
n_files = (total + CHUNK_SIZE - 1) // CHUNK_SIZE
|
||||
|
||||
log.info(f"Written → {output} ({len(parts)} unique parts)")
|
||||
for idx in range(n_files):
|
||||
chunk = df.iloc[idx * CHUNK_SIZE : (idx + 1) * CHUNK_SIZE]
|
||||
out = output_dir / f"bom_parts_{idx + 1}_of_{n_files}.xlsx"
|
||||
|
||||
with pd.ExcelWriter(out, engine="openpyxl") as writer:
|
||||
chunk.to_excel(writer, index=False, sheet_name="Parts")
|
||||
ws = writer.sheets["Parts"]
|
||||
for col in ws.columns:
|
||||
width = max(len(str(cell.value or "")) for cell in col)
|
||||
ws.column_dimensions[col[0].column_letter].width = min(width + 3, 60)
|
||||
|
||||
log.info(f" Written → {out} ({len(chunk)} parts)")
|
||||
|
||||
log.info(f"Done – {total} unique parts across {n_files} file(s) in {output_dir}/")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -137,4 +167,4 @@ if __name__ == "__main__":
|
||||
sys.exit(1)
|
||||
|
||||
parts = extract(BOM_DIR)
|
||||
write(parts, OUTPUT_FILE)
|
||||
write_chunks(parts, OUTPUT_DIR)
|
||||
|
||||
Reference in New Issue
Block a user