78 lines
2.4 KiB
Python
78 lines
2.4 KiB
Python
"""
Detect horizontal table borders in a scanned PDF page PNG
and extract row bounding boxes.

Usage:
    python detect_table_rows.py <page_png> [--min-width 0.7] [--debug]

Prints detected row y-ranges as normalized (0-1) coordinates.
"""
|
|
import sys
|
|
import argparse
|
|
import numpy as np
|
|
from PIL import Image
|
|
|
|
|
|
def detect_rows(page_png, min_width_frac=0.7, debug=False, *, min_gap=5):
    """Detect horizontal table borders and return the rows between them.

    A pixel row is treated as part of a border when the number of dark
    pixels in it (grayscale value below 128) is strictly greater than
    ``int(min_width_frac * width)``. Consecutive border pixel rows are
    merged into bands, and each gap between two neighbouring bands that is
    taller than ``min_gap`` pixels is reported as one content row.

    Args:
        page_png: The page image. Either something ``PIL.Image.open``
            accepts (it is converted to 8-bit grayscale), or an
            already-loaded 2-D ``numpy`` array of grayscale values, which
            is used as-is.
        min_width_frac: Fraction of the image width that must be dark for
            a pixel row to count as a border line.
        debug: When true, print the detected bands and content rows to
            stdout.
        min_gap: A gap between two bands must be taller than this many
            pixels to be reported; smaller gaps are skipped.

    Returns:
        A list of ``(y_top, y_bottom)`` tuples, top to bottom, where both
        values are pixel offsets divided by the image height. The list is
        empty when no border band is found (a hint is then written to
        stderr) or when no gap is tall enough.

    Raises:
        ValueError: If ``page_png`` is a ``numpy`` array that is not 2-D.
    """
    if isinstance(page_png, np.ndarray):
        arr = page_png
        if arr.ndim != 2:
            raise ValueError(
                f"expected a 2-D grayscale array, got {arr.ndim} dimension(s)"
            )
    else:
        # The context manager releases the file handle as soon as the
        # pixel data has been copied into the array.
        with Image.open(page_png) as img:
            arr = np.array(img.convert('L'))  # grayscale
    h, w = arr.shape

    # Binarize (dark pixels are potential line pixels) and count per row.
    row_dark_count = (arr < 128).sum(axis=1)
    min_dark = int(min_width_frac * w)

    # Rows that are mostly dark are horizontal line candidates.
    is_line = row_dark_count > min_dark
    line_bands = _find_line_bands(is_line)

    if not line_bands:
        print("No table lines detected. Try reducing --min-width.", file=sys.stderr)
        return []

    # A content row spans from the bottom of one border band to the top of
    # the next. Bounds are kept as integers so the debug output below never
    # has to recover them from rounded floats.
    pixel_rows = [
        (upper[1], lower[0])
        for upper, lower in zip(line_bands, line_bands[1:])
        if lower[0] - upper[1] > min_gap  # skip tiny gaps
    ]
    rows = [(top / h, bot / h) for top, bot in pixel_rows]

    if debug:
        print(f"Detected {len(line_bands)} line bands:")
        for start, end in line_bands:
            print(f" pixels {start}-{end} (y={start/h:.3f}-{end/h:.3f})")
        print(f"\nDetected {len(rows)} content rows:")
        for i, ((y0, y1), (top, bot)) in enumerate(zip(rows, pixel_rows)):
            print(f" row {i}: y={y0:.3f}-{y1:.3f} (pixels {top}-{bot}, height={bot - top}px)")

    return rows


def _find_line_bands(is_line):
    """Group consecutive True entries of a 1-D mask into (start, end) bands.

    ``end`` is exclusive: it is the index of the first entry after the run,
    or ``len(is_line)`` when the run reaches the end of the mask.
    """
    # Padding with False on both sides makes every run produce exactly one
    # rising (+1) and one falling (-1) edge, including runs at the borders.
    padded = np.concatenate(([False], is_line, [False])).astype(np.int8)
    edges = np.diff(padded)
    starts = np.flatnonzero(edges == 1)
    ends = np.flatnonzero(edges == -1)
    return [(int(start), int(end)) for start, end in zip(starts, ends)]
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: parse the image path and tuning flags,
    # run the detection, and report the rows found.
    cli = argparse.ArgumentParser()
    cli.add_argument('page_png')
    cli.add_argument('--min-width', type=float, default=0.7)
    cli.add_argument('--debug', action='store_true')
    opts = cli.parse_args()

    detected = detect_rows(
        opts.page_png,
        min_width_frac=opts.min_width,
        debug=opts.debug,
    )

    # With --debug, detect_rows prints its own report of the rows, so the
    # plain listing is only emitted without it.
    if not opts.debug:
        for index, (top, bottom) in enumerate(detected):
            print(f"row {index}: {top:.4f} - {bottom:.4f}")
|