"""Batch helpers for copying Excel workbooks and cleaning Word documents.

Two interactive modes are offered by :func:`main`:

1. Excel: every ``.xlsx``/``.xlsm`` file under a source folder is loaded
   (decrypting it with a candidate password when a plain load fails) and its
   cell values and cell styles are copied into a brand-new workbook saved
   under a destination folder.
2. Word: every ``.docx``/``.docm`` file under a source folder is rewritten
   with the protection-related XML elements removed from its ``.xml`` parts.
"""
# Provenance: recovered from patch c6dc3455ac0d51be0f6e576a3e581c6774e5406d
# ("Previous Work", Bobby Abellana, Tue, 11 Feb 2025 10:22:03 -0800), which
# created src/main.py.

import logging
import os
import re
import warnings
import zipfile
from copy import copy, deepcopy  # deepcopy is unused here; kept from the original import list
from io import BytesIO

import msoffcrypto
from openpyxl import Workbook, load_workbook
from tqdm import tqdm

warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl.reader.workbook')

# NOTE(review): the element names inside these patterns were lost when the
# patch text was extracted -- all five patterns had collapsed to
# r"]*(?:/>|>.*?)", which matches every bare ">" and would therefore strip
# every tag terminator from the XML. Only the pattern *shape* is certain; the
# element names below are WordprocessingML protection-related elements chosen
# as a reconstruction. Confirm them against the original commit.
_PROTECTION_TAGS = (
    "w:documentProtection",
    "w:writeProtection",
    "w:permStart",
    "w:permEnd",
    "w:formProt",
)

# One pattern per element, matching either the self-closing form (<tag .../>)
# or an open/close pair with arbitrary content. The attribute run is matched
# lazily so the self-closing form is recognised before the paired form is
# attempted; "\b" stops a name from matching a longer element name.
PROTECTION_PATTERNS = [
    rf"<{tag}\b[^>]*?(?:/>|>.*?</{tag}>)" for tag in _PROTECTION_TAGS
]

EXCEL_EXTENSIONS = ('.xlsx', '.xlsm')
WORD_EXTENSIONS = ('.docx', '.docm')


def setup_logging():
    """Configure root logging: DEBUG level, timestamped lines, stream output."""
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s [%(levelname)s] %(message)s',
        handlers=[
            logging.StreamHandler(),
        ]
    )


def _ensure_parent_dir(path):
    """Create the directory that will contain *path* and return its name.

    ``os.makedirs('')`` raises, so a path without a directory component
    (a bare file name) is left alone.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    return parent


def load_workbook_with_possible_passwords(filepath, passwords, keep_vba=False, data_only=False):
    """Load a workbook, falling back to decrypting it with candidate passwords.

    A plain ``load_workbook`` is attempted first. If that fails for any
    reason, each entry of *passwords* (stripped of surrounding whitespace) is
    used in turn to decrypt the file into memory with ``msoffcrypto`` and the
    decrypted bytes are loaded instead.

    Args:
        filepath: Path of the workbook on disk.
        passwords: Iterable of candidate password strings; may be empty.
        keep_vba: Forwarded to ``openpyxl.load_workbook``.
        data_only: Forwarded to ``openpyxl.load_workbook``.

    Returns:
        The loaded openpyxl workbook.

    Raises:
        ValueError: If neither the plain load nor any password succeeded. The
            most recent underlying error is attached as ``__cause__``.
    """
    try:
        wb = load_workbook(filename=filepath, keep_vba=keep_vba, data_only=data_only)
    except Exception as exc:
        # Any failure (encrypted, missing, corrupt) sends us to the password
        # loop; remember it so the final error can explain what went wrong.
        last_error = exc
        logging.warning(f"Failed to load workbook normally, trying with passwords for: {filepath}")
    else:
        logging.info(f"Successfully loaded workbook: {filepath}")
        return wb

    for index, raw_password in enumerate(passwords, start=1):
        password = raw_password.strip()
        try:
            decrypted = BytesIO()
            with open(filepath, 'rb') as f:
                office_file = msoffcrypto.OfficeFile(f)
                office_file.load_key(password=password)
                office_file.decrypt(decrypted)

            decrypted.seek(0)
            wb = load_workbook(filename=decrypted, keep_vba=keep_vba, data_only=data_only)
        except Exception as exc:
            last_error = exc
            # Candidates are identified by position: secrets must not be
            # written to the log.
            logging.debug(f"Password candidate #{index} did not work for '{filepath}'")
            continue

        logging.info(f"Successfully decrypted '{filepath}' with password candidate #{index}")
        return wb

    raise ValueError(f"None of the provided passwords worked for '{filepath}'") from last_error


def _copy_cell(cell, dest_sheet):
    """Copy one cell's value and, when it is styled, its style attributes."""
    dest_cell = dest_sheet.cell(row=cell.row, column=cell.column)
    dest_cell.value = cell.value
    if cell.has_style:
        dest_cell.font = copy(cell.font)
        dest_cell.border = copy(cell.border)
        dest_cell.fill = copy(cell.fill)
        dest_cell.number_format = cell.number_format
        dest_cell.protection = copy(cell.protection)
        dest_cell.alignment = copy(cell.alignment)


def copy_excel_file(source_path, destination_path, passwords):
    """Copy every sheet of a workbook into a new workbook and save it.

    For each source sheet a sheet of the same title is created in a fresh
    workbook, and each cell's value and style attributes (font, border, fill,
    number format, protection, alignment) are copied. Nothing else is
    transferred from the source workbook.

    Args:
        source_path: Workbook to read.
        destination_path: Where the new workbook is saved; its parent
            directory is created if needed.
        passwords: Candidate passwords, passed to
            ``load_workbook_with_possible_passwords``.

    Raises:
        ValueError: If the source workbook could not be loaded.
    """
    logging.info(f"Processing Excel file: {source_path}")
    is_xlsm = source_path.lower().endswith('.xlsm')
    # NOTE(review): keep_vba only affects how the source is loaded; the
    # destination is a brand-new Workbook, so macros are presumably not
    # carried over -- confirm whether that is intended for .xlsm inputs.
    source_wb = load_workbook_with_possible_passwords(
        filepath=source_path,
        passwords=passwords,
        keep_vba=is_xlsm,
        data_only=False
    )
    dest_wb = Workbook()

    # Close both workbooks even when copying or saving fails.
    try:
        # Drop the placeholder sheet a new Workbook starts with.
        if len(dest_wb.sheetnames) == 1 and dest_wb.active.title == 'Sheet':
            dest_wb.remove(dest_wb.active)

        for sheet_name in source_wb.sheetnames:
            logging.debug(f"Copying sheet: {sheet_name}")
            source_sheet = source_wb[sheet_name]
            dest_sheet = dest_wb.create_sheet(title=sheet_name)

            for row in source_sheet.iter_rows():
                for cell in row:
                    _copy_cell(cell, dest_sheet)

        parent = _ensure_parent_dir(destination_path)
        logging.debug(f"Destination directory ensured: {parent}")

        dest_wb.save(destination_path)
        logging.info(f"Saved copied file to: {destination_path}")
    finally:
        dest_wb.close()
        source_wb.close()


def remove_all_protection_tags(docx_path, output_path):
    """Write a copy of a Word package with protection elements removed.

    Every archive member is copied to *output_path*; members whose name ends
    in ``.xml`` first have each ``PROTECTION_PATTERNS`` match deleted.

    Args:
        docx_path: Source ``.docx``/``.docm`` archive.
        output_path: Destination archive; its parent directory is created if
            needed. Must differ from *docx_path*.

    Raises:
        ValueError: If *output_path* is the same file as *docx_path*.
    """
    logging.info(f"Processing Word document: {docx_path}")

    # Opening the output in 'w' mode truncates it, which would destroy the
    # source while it is still being read.
    if os.path.abspath(docx_path) == os.path.abspath(output_path):
        raise ValueError(f"Output path must differ from input path: '{docx_path}'")

    # The directory has to exist before the output archive is opened.
    parent = _ensure_parent_dir(output_path)
    logging.debug(f"Output directory ensured: {parent}")

    # Match on raw bytes: the patterns are ASCII, and this leaves any part
    # that is not valid UTF-8 byte-for-byte intact instead of silently
    # dropping undecodable bytes on a decode/encode round trip.
    regexes = [re.compile(pattern.encode('utf-8'), re.DOTALL) for pattern in PROTECTION_PATTERNS]

    with zipfile.ZipFile(docx_path, 'r') as zip_in, \
            zipfile.ZipFile(output_path, 'w', compression=zipfile.ZIP_DEFLATED) as zip_out:
        for item in tqdm(zip_in.namelist(), desc="Processing XML files"):
            data = zip_in.read(item)

            if item.lower().endswith('.xml'):
                for regex in regexes:
                    data = regex.sub(b"", data)

            zip_out.writestr(item, data)

    logging.info(f"Saved cleaned file to: {output_path}")


def _find_files(source_dir, extensions):
    """Return paths of all files under *source_dir* ending in *extensions*."""
    return [
        os.path.join(root, name)
        for root, _, names in os.walk(source_dir)
        for name in names
        if name.lower().endswith(extensions)
    ]


def _mirrored_destination(source_path, source_dir, dest_dir):
    """Map *source_path* to the same relative location under *dest_dir*."""
    relative_dir = os.path.relpath(os.path.dirname(source_path), source_dir)
    return os.path.join(dest_dir, relative_dir, os.path.basename(source_path))


def _prompt_passwords():
    """Ask how passwords are supplied and return the candidate list."""
    password_option = input("Choose password option (file/single/none): ").strip().lower()

    if password_option == 'file':
        password_file = input("Enter the path to the password file: ").strip()
        with open(password_file, 'r', encoding='utf-8') as pf:
            # One candidate per line; blank lines are skipped.
            return [line.strip() for line in pf if line.strip()]

    if password_option == 'single':
        return [input("Enter the password: ").strip()]

    if password_option != 'none':
        logging.warning(f"Unrecognized password option '{password_option}'; continuing without passwords.")
    return []


def _process_tree(source_dir, dest_dir, extensions, progress_label, process_file, failure_verb):
    """Run *process_file(source, dest)* for each matching file under a tree.

    A failure on one file is logged and does not stop the remaining files.
    """
    files = _find_files(source_dir, extensions)
    if not files:
        logging.warning(f"No matching files found under: {source_dir}")

    for source_path in tqdm(files, desc=progress_label):
        dest_path = _mirrored_destination(source_path, source_dir, dest_dir)
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
        logging.debug(f"Ensured destination path: {dest_path}")

        try:
            process_file(source_path, dest_path)
        except Exception as e:
            # Top-level boundary: report and carry on with the next file.
            logging.error(f"Failed to {failure_verb} {source_path}: {e}")


def main():
    """Interactive entry point: choose a mode, then process a folder tree."""
    setup_logging()
    print("\nChoose the file type to process:")
    print("1. Excel files")
    print("2. Word documents")
    choice = input("Enter your choice (1 or 2): ").strip()

    if choice == '1':
        source_dir = input("Enter the source folder with Excel files: ").strip()
        dest_dir = input("Enter the destination folder for copied files: ").strip()
        os.makedirs(dest_dir, exist_ok=True)

        try:
            passwords = _prompt_passwords()
        except OSError as e:
            logging.error(f"Could not read password file: {e}")
            return

        _process_tree(
            source_dir,
            dest_dir,
            EXCEL_EXTENSIONS,
            "Copying Excel Files",
            lambda src, dst: copy_excel_file(src, dst, passwords),
            "copy",
        )

    elif choice == '2':
        source_dir = input("Enter the source folder with Word files: ").strip()
        dest_dir = input("Enter the destination folder for cleaned files: ").strip()
        os.makedirs(dest_dir, exist_ok=True)

        _process_tree(
            source_dir,
            dest_dir,
            WORD_EXTENSIONS,
            "Removing Word Protections",
            remove_all_protection_tags,
            "clean",
        )

    else:
        logging.error("Invalid choice. Please restart the script and choose a valid option.")


if __name__ == "__main__":
    main()