Files
Learn_System/backend/scripts/detect_table_rows.py

78 lines
2.4 KiB
Python

"""
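Detect horizontal table borders in a scanned PDF page PNG
and extract the vertical extent (y-range) of each table row.

Usage:
    python detect_table_rows.py <page_png> [--min-width 0.7] [--debug]

--min-width is the fraction of the page width that the number of dark
pixels in a pixel row must exceed for that row to count as a border line.
--debug additionally prints the detected line bands in pixel coordinates.

Prints detected row y-ranges as normalized (0-1) coordinates.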
"""
import sys
import argparse
import numpy as np
from PIL import Image
def _load_grayscale(page_png):
    """Return the page as a 2-D grayscale array.

    *page_png* may be anything ``Image.open`` accepts, or an already-decoded
    2-D numpy array, which is returned unchanged.
    """
    if isinstance(page_png, np.ndarray):
        if page_png.ndim != 2:
            raise ValueError(
                f"expected a 2-D grayscale array, got shape {page_png.shape}"
            )
        return page_png
    # The context manager guarantees the underlying file is closed even if
    # decoding or conversion fails.
    with Image.open(page_png) as img:
        return np.array(img.convert('L'))


def _find_line_bands(is_line):
    """Group consecutive True entries into half-open ``(start, end)`` bands."""
    # Padding with 0 on both sides makes every band produce exactly one
    # rising edge (+1) and one falling edge (-1), including bands touching
    # the first or last pixel row.
    padded = np.concatenate(([0], np.asarray(is_line, dtype=np.int8), [0]))
    steps = np.diff(padded)
    starts = np.flatnonzero(steps == 1)
    ends = np.flatnonzero(steps == -1)
    return [(int(s), int(e)) for s, e in zip(starts, ends)]


def detect_rows(page_png, min_width_frac=0.7, debug=False):
    """Find table rows delimited by horizontal border lines on a page image.

    A pixel row counts as part of a border line when its number of dark
    pixels (gray value < 128) strictly exceeds ``int(min_width_frac * width)``.
    Consecutive line pixel rows are merged into bands, and each gap between
    two neighbouring bands that is taller than 5 pixels is reported as a
    content row.

    Args:
        page_png: Path (or file object) of the page image, or an
            already-decoded 2-D grayscale numpy array.
        min_width_frac: Fraction of the image width that the dark-pixel
            count of a pixel row must exceed to be treated as a line.
        debug: When true, print the detected line bands and content rows
            (with pixel coordinates) to stdout.

    Returns:
        A list of ``(y_top, y_bottom)`` tuples normalized to the image height
        (0-1), ordered top to bottom. Empty when no line is detected, in
        which case a hint is written to stderr.

    Raises:
        ValueError: If an array is passed that is not 2-D.
    """
    arr = _load_grayscale(page_png)
    h, w = arr.shape

    # Binarize: dark pixels (potential lines) = True, then count them per row.
    dark = arr < 128
    row_dark_count = dark.sum(axis=1)
    min_dark = int(min_width_frac * w)
    is_line = row_dark_count > min_dark

    line_bands = _find_line_bands(is_line)
    if not line_bands:
        print("No table lines detected. Try reducing --min-width.", file=sys.stderr)
        return []

    # A content row spans from the bottom of one border band to the top of
    # the next one. Gaps of 5 pixels or fewer are skipped as tiny gaps.
    pixel_rows = [
        (upper[1], lower[0])
        for upper, lower in zip(line_bands, line_bands[1:])
        if lower[0] - upper[1] > 5
    ]
    rows = [(y_top / h, y_bot / h) for y_top, y_bot in pixel_rows]

    if debug:
        print(f"Detected {len(line_bands)} line bands:")
        for b in line_bands:
            print(f" pixels {b[0]}-{b[1]} (y={b[0]/h:.3f}-{b[1]/h:.3f})")
        print(f"\nDetected {len(rows)} content rows:")
        # Print the exact integer pixel ranges: rebuilding them from the
        # normalized floats with int() truncates and can be off by one.
        for i, ((y0, y1), (top, bot)) in enumerate(zip(rows, pixel_rows)):
            print(f" row {i}: y={y0:.3f}-{y1:.3f} (pixels {top}-{bot}, height={bot - top}px)")
    return rows
if __name__ == '__main__':
    # Command-line entry point: parse the options, run the detection and
    # print one line per detected row.
    cli = argparse.ArgumentParser()
    cli.add_argument('page_png')
    cli.add_argument('--min-width', type=float, default=0.7)
    cli.add_argument('--debug', action='store_true')
    opts = cli.parse_args()

    detected = detect_rows(
        opts.page_png,
        min_width_frac=opts.min_width,
        debug=opts.debug,
    )

    # With --debug, detect_rows has already printed the rows itself, so the
    # compact listing is only emitted in the normal mode.
    if not opts.debug:
        for idx, span in enumerate(detected):
            top, bottom = span
            print(f"row {idx}: {top:.4f} - {bottom:.4f}")