"""Interactive utility for Excel workbooks and Word documents.

Two modes are offered by ``main``:

1. Excel: each ``.xlsx``/``.xlsm`` file under a source folder is loaded
   (trying supplied passwords via ``msoffcrypto`` when a plain load fails)
   and its sheets are copied cell by cell into a brand-new workbook.
2. Word: each ``.docx``/``.docm`` archive under a source folder is rewritten
   with every ``PROTECTION_PATTERNS`` match removed from its ``.xml`` parts.
"""
import logging
import os
import re
import warnings
import zipfile
from copy import copy, deepcopy
from io import BytesIO

import msoffcrypto
from openpyxl import Workbook, load_workbook
from tqdm import tqdm

warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl.reader.workbook')

# Regexes removed from every .xml part by remove_all_protection_tags().
# NOTE(review): these five literals are identical and look truncated -- the
# leading "<tag[^>" and the closing "</tag>" appear to have been stripped
# (possibly by an HTML sanitizer). As written, each one matches every ">" and
# "/>" in the text, which would corrupt the XML. They are kept byte-for-byte
# here because the intended element names cannot be recovered from this file;
# restore them from the original source before running the Word mode.
PROTECTION_PATTERNS = [
    r"]*(?:/>|>.*?)",
    r"]*(?:/>|>.*?)",
    r"]*(?:/>|>.*?)",
    r"]*(?:/>|>.*?)",
    r"]*(?:/>|>.*?)",
]


def setup_logging():
    """Configure root logging at DEBUG level with a single stream handler."""
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s [%(levelname)s] %(message)s',
        handlers=[
            logging.StreamHandler(),
        ]
    )


def load_workbook_with_possible_passwords(filepath, passwords, keep_vba=False, data_only=False):
    """Load a workbook, falling back to decrypting it with each password in turn.

    Args:
        filepath: Path of the workbook to open.
        passwords: Iterable of candidate password strings; each is stripped
            of surrounding whitespace before use.
        keep_vba: Forwarded to ``openpyxl.load_workbook``.
        data_only: Forwarded to ``openpyxl.load_workbook``.

    Returns:
        The loaded openpyxl workbook.

    Raises:
        ValueError: If the plain load fails and no password yields a
            loadable workbook (including when ``passwords`` is empty).
    """
    try:
        wb = load_workbook(filename=filepath, keep_vba=keep_vba, data_only=data_only)
    except Exception:
        # Any failure (not only encryption) triggers the password fallback.
        logging.warning(f"Failed to load workbook normally, trying with passwords for: {filepath}")
    else:
        logging.info(f"Successfully loaded workbook: {filepath}")
        return wb

    for candidate in passwords:
        password = candidate.strip()
        try:
            decrypted = BytesIO()
            with open(filepath, 'rb') as f:
                office_file = msoffcrypto.OfficeFile(f)
                office_file.load_key(password=password)
                office_file.decrypt(decrypted)
            decrypted.seek(0)
            wb = load_workbook(filename=decrypted, keep_vba=keep_vba, data_only=data_only)
            logging.info(f"Successfully decrypted '{filepath}' with password: '{password}'")
            return wb
        except Exception:
            logging.debug(f"Password '{password}' did not work for '{filepath}'")
            continue

    raise ValueError(f"None of the provided passwords worked for '{filepath}'")


def _ensure_parent_dir(path):
    """Create the parent directory of *path* if the path has one.

    ``os.makedirs('')`` raises, so a bare filename (empty dirname) is skipped.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    return parent


def _copy_chartsheet_as_worksheet(source_sheet, dest_wb, sheet_name):
    """Create a plain worksheet holding a textual summary of a chartsheet.

    Every attribute access is guarded with ``hasattr`` because the chart
    object's shape is not guaranteed; missing attributes are simply skipped.
    """
    dest_sheet = dest_wb.create_sheet(title=sheet_name)

    chart = getattr(source_sheet, 'chart', None)
    if not chart:
        return dest_sheet

    if hasattr(chart, 'title') and chart.title:
        dest_sheet['A1'] = f"Chart Title: {chart.title.text}"

    if hasattr(chart, 'series'):
        row = 2
        for idx, series in enumerate(chart.series, 1):
            if hasattr(series, 'title'):
                dest_sheet.cell(row=row, column=1, value=f"Series {idx}: {series.title}")
            if hasattr(series, 'values'):
                try:
                    for col, value in enumerate(series.values, 2):
                        dest_sheet.cell(row=row, column=col, value=value)
                except Exception as exc:
                    # Best effort: keep whatever was written and move on.
                    logging.debug(
                        f"Could not read values of series {idx} in sheet '{sheet_name}': {exc}"
                    )
            row += 1

    return dest_sheet


def _copy_worksheet(source_sheet, dest_wb, sheet_name):
    """Copy values, cell styles, dimensions, view and merges into a new sheet."""
    dest_sheet = dest_wb.create_sheet(title=sheet_name)

    # Cell contents and styles.
    for row in source_sheet.iter_rows():
        for cell in row:
            dest_cell = dest_sheet.cell(row=cell.row, column=cell.column)
            dest_cell.value = cell.value
            if cell.has_style:
                dest_cell.font = copy(cell.font)
                dest_cell.border = copy(cell.border)
                dest_cell.fill = copy(cell.fill)
                dest_cell.number_format = cell.number_format
                dest_cell.protection = copy(cell.protection)
                dest_cell.alignment = copy(cell.alignment)

    if source_sheet.freeze_panes:
        dest_sheet.freeze_panes = source_sheet.freeze_panes

    for col in source_sheet.column_dimensions:
        dest_sheet.column_dimensions[col] = copy(source_sheet.column_dimensions[col])

    for row_idx in source_sheet.row_dimensions:
        dest_sheet.row_dimensions[row_idx] = copy(source_sheet.row_dimensions[row_idx])

    if hasattr(source_sheet, 'sheet_properties'):
        dest_sheet.sheet_properties = copy(source_sheet.sheet_properties)

    if hasattr(source_sheet, 'sheet_view'):
        # Worksheet.sheet_view is a getter-only property over
        # views.sheetView[0] in openpyxl, so assigning to it raises
        # AttributeError; replace the underlying list entry instead.
        # NOTE(review): confirm against the installed openpyxl version.
        dest_sheet.views.sheetView[0] = copy(source_sheet.sheet_view)

    if source_sheet.merged_cells:
        for merged_range in source_sheet.merged_cells:
            dest_sheet.merge_cells(str(merged_range))

    return dest_sheet


def copy_excel_file(source_path, destination_path, passwords):
    """Rebuild the workbook at *source_path* as a new file at *destination_path*.

    The source is opened with ``load_workbook_with_possible_passwords``
    (``keep_vba`` is enabled for ``.xlsm`` paths). Each sheet is recreated in
    a fresh ``Workbook``: chartsheets become worksheets with a text summary,
    regular worksheets are copied cell by cell.

    Args:
        source_path: Path of the workbook to read.
        destination_path: Path the new workbook is saved to; its parent
            directory is created when missing.
        passwords: Candidate passwords for an encrypted source.

    Raises:
        ValueError: Propagated from the loader when the file cannot be opened.
    """
    # Local import kept inside the function, but hoisted out of the sheet loop.
    from openpyxl.chartsheet import Chartsheet

    logging.info(f"Processing Excel file: {source_path}")
    is_xlsm = source_path.lower().endswith('.xlsm')

    source_wb = load_workbook_with_possible_passwords(
        filepath=source_path,
        passwords=passwords,
        keep_vba=is_xlsm,
        data_only=False
    )
    dest_wb = Workbook()
    try:
        # Drop the default sheet a new Workbook starts with.
        if len(dest_wb.sheetnames) == 1 and dest_wb.active.title == 'Sheet':
            dest_wb.remove(dest_wb.active)

        for sheet_name in source_wb.sheetnames:
            logging.debug(f"Copying sheet: {sheet_name}")
            source_sheet = source_wb[sheet_name]
            if isinstance(source_sheet, Chartsheet):
                _copy_chartsheet_as_worksheet(source_sheet, dest_wb, sheet_name)
            else:
                _copy_worksheet(source_sheet, dest_wb, sheet_name)

        _ensure_parent_dir(destination_path)
        logging.debug(f"Destination directory ensured: {os.path.dirname(destination_path)}")

        dest_wb.save(destination_path)
        logging.info(f"Saved copied file to: {destination_path}")
    finally:
        # Release both workbooks even when copying or saving fails.
        dest_wb.close()
        source_wb.close()


def remove_all_protection_tags(docx_path, output_path):
    """Rewrite a Word archive with ``PROTECTION_PATTERNS`` matches removed.

    Every member of the zip at *docx_path* is copied to *output_path*;
    members whose name ends in ``.xml`` are decoded as UTF-8 (undecodable
    bytes are dropped), have each pattern substituted with an empty string
    (``re.DOTALL``), and are re-encoded.

    Args:
        docx_path: Path of the source ``.docx``/``.docm`` archive.
        output_path: Path of the archive to create; its parent directory is
            created first when missing.
    """
    logging.info(f"Processing Word document: {docx_path}")

    # The directory must exist before the output archive is opened for writing.
    _ensure_parent_dir(output_path)
    logging.debug(f"Output directory ensured: {os.path.dirname(output_path)}")

    with zipfile.ZipFile(docx_path, 'r') as zip_in:
        file_list = zip_in.namelist()
        with zipfile.ZipFile(output_path, 'w', compression=zipfile.ZIP_DEFLATED) as zip_out:
            for item in tqdm(file_list, desc="Processing XML files"):
                data = zip_in.read(item)
                if item.lower().endswith('.xml'):
                    text = data.decode('utf-8', errors='ignore')
                    for pattern in PROTECTION_PATTERNS:
                        text = re.sub(pattern, "", text, flags=re.DOTALL)
                    data = text.encode('utf-8')
                zip_out.writestr(item, data)

    logging.info(f"Saved cleaned file to: {output_path}")


def _find_files(source_dir, extensions):
    """Return paths under *source_dir* whose lowercased name ends with *extensions*."""
    return [
        os.path.join(root, name)
        for root, _, names in os.walk(source_dir)
        for name in names
        if name.lower().endswith(extensions)
    ]


def _mirrored_destination(source_path, source_dir, dest_dir):
    """Map *source_path* to the same relative location under *dest_dir*.

    The destination's parent directory is created as a side effect.
    """
    relative_dir = os.path.relpath(os.path.dirname(source_path), source_dir)
    dest_path = os.path.join(dest_dir, relative_dir, os.path.basename(source_path))
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)
    logging.debug(f"Ensured destination path: {dest_path}")
    return dest_path


def main():
    """Prompt for a mode and folders, then process every matching file.

    A failure on one file is logged and does not stop the remaining files.
    """
    setup_logging()

    print("\nChoose the file type to process:")
    print("1. Excel files")
    print("2. Word documents")
    choice = input("Enter your choice (1 or 2): ").strip()

    if choice == '1':
        source_dir = input("Enter the source folder with Excel files: ").strip()
        dest_dir = input("Enter the destination folder for copied files: ").strip()
        os.makedirs(dest_dir, exist_ok=True)

        password_option = input("Choose password option (file/single/none): ").strip().lower()
        passwords = []
        if password_option == 'file':
            password_file = input("Enter the path to the password file: ").strip()
            with open(password_file, 'r', encoding='utf-8') as pf:
                passwords = [line.strip() for line in pf if line.strip()]
        elif password_option == 'single':
            single_password = input("Enter the password: ").strip()
            passwords = [single_password]
        # Any other answer leaves the password list empty.

        files = _find_files(source_dir, ('.xlsx', '.xlsm'))
        for source_path in tqdm(files, desc="Copying Excel Files"):
            dest_path = _mirrored_destination(source_path, source_dir, dest_dir)
            try:
                copy_excel_file(source_path, dest_path, passwords)
            except Exception as e:
                logging.error(f"Failed to copy {source_path}: {e}")

    elif choice == '2':
        source_dir = input("Enter the source folder with Word files: ").strip()
        dest_dir = input("Enter the destination folder for cleaned files: ").strip()
        os.makedirs(dest_dir, exist_ok=True)

        files = _find_files(source_dir, ('.docx', '.docm'))
        for source_path in tqdm(files, desc="Removing Word Protections"):
            dest_path = _mirrored_destination(source_path, source_dir, dest_dir)
            try:
                remove_all_protection_tags(source_path, dest_path)
            except Exception as e:
                logging.error(f"Failed to clean {source_path}: {e}")

    else:
        logging.error("Invalid choice. Please restart the script and choose a valid option.")


if __name__ == "__main__":
    main()