import os import re import base64 import cv2 import numpy as np from bs4 import BeautifulSoup def extract_base64_image(html_file): """Extracts base64-encoded image data from an HTML file.""" with open(html_file, "r", encoding="utf-8") as f: soup = BeautifulSoup(f, "html.parser") img_tag = soup.find("img") if img_tag and 'src' in img_tag.attrs: match = re.search(r'data:image/png;base64,(.+)', img_tag['src']) if match: return match.group(1) return None def resize_image_base64(image_data, width=100): """Resizes the image to the specified width while keeping aspect ratio and returns base64.""" img_array = np.frombuffer(base64.b64decode(image_data), dtype=np.uint8) img = cv2.imdecode(img_array, cv2.IMREAD_UNCHANGED) if img is None: return None h, w = img.shape[:2] new_height = int((width / w) * h) resized_img = cv2.resize(img, (width, new_height), interpolation=cv2.INTER_AREA) _, buffer = cv2.imencode('.png', resized_img) resized_base64 = base64.b64encode(buffer).decode('utf-8') return resized_base64 def categorize_image(image_data): """Categorizes the image as 'grey', 'no image', or 'content' and returns resized image.""" if not image_data: return "no_image", None img_array = np.frombuffer(base64.b64decode(image_data), dtype=np.uint8) img = cv2.imdecode(img_array, cv2.IMREAD_GRAYSCALE) if img is None: return "no_image", None mean_value = cv2.mean(img)[0] std_dev = np.std(img) category = "grey" if mean_value > 200 and std_dev < 10 else "content" resized_data = resize_image_base64(image_data) return category, resized_data def process_html_files(input_folder, output_html="gallery.html"): """Processes all HTML files, categorizes them, and generates a gallery.""" categories = {"no_image": [], "grey": [], "content": []} for file in os.listdir(input_folder): if file.endswith(".html") and not file.startswith('report'): file_path = os.path.join(input_folder, file) image_data = extract_base64_image(file_path) category, resized_image = categorize_image(image_data) categories[category].append((file, resized_image)) with open(output_html, "w", encoding="utf-8") as f: f.write("""