import logging
import os
import re
import warnings
import zipfile
from copy import copy, deepcopy
from io import BytesIO

import msoffcrypto
from openpyxl import Workbook, load_workbook
from openpyxl.chartsheet import Chartsheet
from tqdm import tqdm

# Suppress UserWarning messages attributed to the openpyxl.reader.workbook module.
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl.reader.workbook')

# Names (without the ``w:`` prefix) of the WordprocessingML elements that are
# stripped from every XML part of a document.
_PROTECTION_TAGS = (
    "documentProtection",
    "writeProtection",
    "commentProtection",
    "revisionProtection",
    "trackRevisions",
)

# One regex source string per element; callers apply them with ``re.DOTALL``.
#
# * ``(?=[\s/>])`` stops the match from firing on a longer element name that
#   merely starts with the same characters.
# * ``[^>]*?`` is lazy so that the self-closing form (``.../>``) is recognised
#   before the paired form is attempted.  With a greedy ``[^>]*`` the ``/`` was
#   swallowed first, and the paired branch could then run forward to a later
#   closing tag and delete everything in between.
PROTECTION_PATTERNS = [
    rf"<w:{tag}(?=[\s/>])[^>]*?(?:/>|>.*?</w:{tag}>)"
    for tag in _PROTECTION_TAGS
]


def setup_logging():
    """Configure the root logger via ``logging.basicConfig``.

    Records at DEBUG level and above are formatted as
    ``<time> [<level>] <message>`` and sent to a default ``StreamHandler``.
    """
    stream_handler = logging.StreamHandler()
    record_format = '%(asctime)s [%(levelname)s] %(message)s'
    logging.basicConfig(
        level=logging.DEBUG,
        format=record_format,
        handlers=[stream_handler],
    )


def load_workbook_with_possible_passwords(filepath, passwords, keep_vba=False, data_only=False):
    """Load a workbook, falling back to decrypting it with candidate passwords.

    The file is first opened directly with ``load_workbook``.  If that fails
    for any reason, each entry of ``passwords`` (whitespace-stripped) is used
    to decrypt the file into memory with ``msoffcrypto`` and the decrypted
    bytes are loaded instead.

    Args:
        filepath: Path of the workbook on disk.
        passwords: Iterable of candidate password strings; may be empty or None.
        keep_vba: Forwarded to ``load_workbook``.
        data_only: Forwarded to ``load_workbook``.

    Returns:
        The workbook object returned by ``load_workbook``.

    Raises:
        ValueError: If the plain load failed and no candidate password produced
            a loadable workbook.  The most recent underlying error is attached
            as ``__cause__`` so problems such as a missing file stay visible.
    """
    try:
        wb = load_workbook(filename=filepath, keep_vba=keep_vba, data_only=data_only)
    except Exception as exc:
        # Keep a second reference: Python unbinds ``exc`` when the handler ends.
        last_error = exc
        logging.warning(f"Failed to load workbook normally, trying with passwords for: {filepath}")
    else:
        logging.info(f"Successfully loaded workbook: {filepath}")
        return wb

    for position, candidate in enumerate(passwords or (), start=1):
        password = candidate.strip()
        try:
            decrypted = BytesIO()
            with open(filepath, 'rb') as f:
                office_file = msoffcrypto.OfficeFile(f)
                office_file.load_key(password=password)
                office_file.decrypt(decrypted)

            decrypted.seek(0)
            wb = load_workbook(filename=decrypted, keep_vba=keep_vba, data_only=data_only)
        except Exception as exc:
            last_error = exc
            # Passwords are identified by position only so that credentials
            # never end up in log output.
            logging.debug(f"Password candidate #{position} did not work for '{filepath}'")
            continue

        logging.info(f"Successfully decrypted '{filepath}' with password candidate #{position}")
        return wb

    raise ValueError(f"None of the provided passwords worked for '{filepath}'") from last_error


def _copy_chartsheet_summary(source_sheet, dest_sheet):
    """Write a best-effort text summary of a chartsheet into ``dest_sheet``.

    Only attributes that actually exist on the source objects are read; a
    chartsheet that exposes none of them leaves ``dest_sheet`` empty.
    """
    chart = getattr(source_sheet, 'chart', None)
    if not chart:
        return

    title = getattr(chart, 'title', None)
    if title:
        dest_sheet['A1'] = f"Chart Title: {title.text}"

    if not hasattr(chart, 'series'):
        return

    for idx, series in enumerate(chart.series, 1):
        row = idx + 1  # row 1 is reserved for the chart title
        if hasattr(series, 'title'):
            dest_sheet.cell(row=row, column=1, value=f"Series {idx}: {series.title}")

        if hasattr(series, 'values'):
            try:
                for col, value in enumerate(series.values, 2):
                    dest_sheet.cell(row=row, column=col, value=value)
            except Exception as exc:
                # Deliberately best-effort: unreadable values are skipped, but
                # the reason is logged instead of being silently discarded.
                logging.debug(f"Skipping values of series {idx} in '{dest_sheet.title}': {exc}")


def _copy_cells(source_sheet, dest_sheet):
    """Copy every cell value and, where present, its style attributes."""
    for row in source_sheet.iter_rows():
        for cell in row:
            dest_cell = dest_sheet.cell(row=cell.row, column=cell.column)
            dest_cell.value = cell.value
            if cell.has_style:
                dest_cell.font = copy(cell.font)
                dest_cell.border = copy(cell.border)
                dest_cell.fill = copy(cell.fill)
                dest_cell.number_format = cell.number_format
                dest_cell.protection = copy(cell.protection)
                dest_cell.alignment = copy(cell.alignment)


def _copy_dimensions(source_dims, dest_dims, dest_sheet):
    """Copy row or column dimension objects and re-bind them to ``dest_sheet``."""
    for key, dimension in list(source_dims.items()):
        duplicate = copy(dimension)
        # A copied dimension still references the worksheet it came from;
        # openpyxl's own WorksheetCopy re-points it at the target sheet too.
        duplicate.worksheet = dest_sheet
        dest_dims[key] = duplicate


def _copy_worksheet(source_sheet, dest_sheet):
    """Copy cells, layout and view settings of a regular worksheet."""
    _copy_cells(source_sheet, dest_sheet)

    if source_sheet.freeze_panes:
        dest_sheet.freeze_panes = source_sheet.freeze_panes

    _copy_dimensions(source_sheet.column_dimensions, dest_sheet.column_dimensions, dest_sheet)
    _copy_dimensions(source_sheet.row_dimensions, dest_sheet.row_dimensions, dest_sheet)

    if hasattr(source_sheet, 'sheet_properties'):
        dest_sheet.sheet_properties = copy(source_sheet.sheet_properties)

    if hasattr(source_sheet, 'sheet_view'):
        # ``Worksheet.sheet_view`` is a read-only property in openpyxl, so
        # assigning to it raises AttributeError.  Replace the first entry of
        # the underlying view list instead.
        dest_sheet.views.sheetView[0] = copy(source_sheet.sheet_view)

    if source_sheet.merged_cells:
        for merged_range in source_sheet.merged_cells:
            dest_sheet.merge_cells(str(merged_range))


def copy_excel_file(source_path, destination_path, passwords):
    """Rebuild the workbook at ``source_path`` as a new file at ``destination_path``.

    The source is opened through ``load_workbook_with_possible_passwords`` and
    each sheet is recreated in a fresh ``Workbook``: regular worksheets get
    their cells, styles, dimensions, sheet properties, view and merged ranges
    copied; chartsheets are replaced by a worksheet holding a text summary.

    Args:
        source_path: Path of the workbook to read.
        destination_path: Path the new workbook is saved to; its parent
            directory is created when missing.
        passwords: Candidate passwords forwarded to the loader.

    Raises:
        ValueError: Propagated from the loader when the source cannot be opened.
        Exception: Any error raised by openpyxl while copying or saving.
    """
    logging.info(f"Processing Excel file: {source_path}")
    is_xlsm = source_path.lower().endswith('.xlsm')
    source_wb = load_workbook_with_possible_passwords(
        filepath=source_path,
        passwords=passwords,
        keep_vba=is_xlsm,
        data_only=False,
    )

    # NOTE(review): the destination is a brand-new Workbook, so nothing loaded
    # through keep_vba is carried over - confirm that is intended for .xlsm.
    dest_wb = Workbook()
    try:
        # Drop the default sheet so only copied sheets appear in the output.
        if len(dest_wb.sheetnames) == 1 and dest_wb.active.title == 'Sheet':
            dest_wb.remove(dest_wb.active)

        for sheet_name in source_wb.sheetnames:
            logging.debug(f"Copying sheet: {sheet_name}")
            source_sheet = source_wb[sheet_name]
            dest_sheet = dest_wb.create_sheet(title=sheet_name)

            if isinstance(source_sheet, Chartsheet):
                _copy_chartsheet_summary(source_sheet, dest_sheet)
            else:
                _copy_worksheet(source_sheet, dest_sheet)

        dest_dir = os.path.dirname(destination_path)
        if dest_dir:
            # A bare filename has no directory part, and os.makedirs('') raises.
            os.makedirs(dest_dir, exist_ok=True)
        logging.debug(f"Destination directory ensured: {dest_dir}")

        dest_wb.save(destination_path)
        logging.info(f"Saved copied file to: {destination_path}")
    finally:
        # Release both workbooks even when copying or saving failed.
        dest_wb.close()
        source_wb.close()


def remove_all_protection_tags(docx_path, output_path):
    """Copy a Word package while stripping protection elements from its XML parts.

    Every archive member of ``docx_path`` is written to a new archive at
    ``output_path``.  Members whose name ends in ``.xml`` first have every
    match of ``PROTECTION_PATTERNS`` removed; all other members are copied
    unchanged.

    Args:
        docx_path: Path of the source ``.docx``/``.docm`` archive.
        output_path: Path of the archive to create; its parent directory is
            created when missing.  Must differ from ``docx_path`` because the
            output is opened for writing while the input is still being read.

    Raises:
        zipfile.BadZipFile: If ``docx_path`` is not a valid zip archive.
        OSError: If either file cannot be opened.
    """
    logging.info(f"Processing Word document: {docx_path}")

    # The directory has to exist before the output archive is opened;
    # creating it afterwards cannot prevent a failure.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    logging.debug(f"Output directory ensured: {output_dir}")

    # Match on raw bytes: the tag names are plain ASCII, so this behaves the
    # same on UTF-8 parts, while parts in any other encoding pass through
    # untouched instead of being mangled by a lossy decode/encode round trip.
    compiled_patterns = [
        re.compile(pattern.encode('utf-8'), flags=re.DOTALL)
        for pattern in PROTECTION_PATTERNS
    ]

    with zipfile.ZipFile(docx_path, 'r') as zip_in:
        file_list = zip_in.namelist()

        with zipfile.ZipFile(output_path, 'w', compression=zipfile.ZIP_DEFLATED) as zip_out:
            for item in tqdm(file_list, desc="Processing XML files"):
                data = zip_in.read(item)

                if item.lower().endswith('.xml'):
                    for compiled in compiled_patterns:
                        data = compiled.sub(b"", data)

                zip_out.writestr(item, data)

    logging.info(f"Saved cleaned file to: {output_path}")


def _find_files(source_dir, extensions):
    """Return the paths of all files under ``source_dir`` ending in ``extensions``."""
    return [
        os.path.join(root, name)
        for root, _, names in os.walk(source_dir)
        for name in names
        if name.lower().endswith(extensions)
    ]


def _prepare_directories(source_dir, dest_dir):
    """Check the source folder and create the destination; return True when usable."""
    if not os.path.isdir(source_dir):
        # os.walk yields nothing for a missing folder, which would otherwise
        # look like a successful run that processed zero files.
        logging.error(f"Source folder does not exist: {source_dir}")
        return False
    os.makedirs(dest_dir, exist_ok=True)
    return True


def _prompt_for_passwords():
    """Ask how passwords are supplied and return them as a list of strings."""
    password_option = input("Choose password option (file/single/none): ").strip().lower()

    if password_option == 'file':
        password_file = input("Enter the path to the password file: ").strip()
        with open(password_file, 'r', encoding='utf-8') as pf:
            return [line.strip() for line in pf if line.strip()]

    if password_option == 'single':
        return [input("Enter the password: ").strip()]

    if password_option not in ('', 'none'):
        logging.warning(f"Unknown password option '{password_option}', continuing without passwords.")
    return []


def _process_tree(source_dir, dest_dir, extensions, handler, desc, action):
    """Run ``handler(source_path, dest_path)`` for each matching file.

    ``dest_path`` mirrors the file's location relative to ``source_dir`` under
    ``dest_dir``.  A failure on one file is logged and does not stop the run.
    """
    for source_path in tqdm(_find_files(source_dir, extensions), desc=desc):
        relative_path = os.path.relpath(os.path.dirname(source_path), source_dir)
        dest_path = os.path.join(dest_dir, relative_path, os.path.basename(source_path))
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
        logging.debug(f"Ensured destination path: {dest_path}")

        try:
            handler(source_path, dest_path)
        except Exception as e:
            logging.error(f"Failed to {action} {source_path}: {e}")


def main():
    """Interactive entry point: pick a file type, then process a folder tree.

    Choice ``1`` copies every ``.xlsx``/``.xlsm`` file via ``copy_excel_file``;
    choice ``2`` cleans every ``.docx``/``.docm`` file via
    ``remove_all_protection_tags``.  Any other answer logs an error.
    """
    setup_logging()
    print("\nChoose the file type to process:")
    print("1. Excel files")
    print("2. Word documents")
    choice = input("Enter your choice (1 or 2): ").strip()

    if choice == '1':
        source_dir = input("Enter the source folder with Excel files: ").strip()
        dest_dir = input("Enter the destination folder for copied files: ").strip()
        if not _prepare_directories(source_dir, dest_dir):
            return

        passwords = _prompt_for_passwords()
        _process_tree(
            source_dir,
            dest_dir,
            ('.xlsx', '.xlsm'),
            lambda src, dst: copy_excel_file(src, dst, passwords),
            "Copying Excel Files",
            "copy",
        )

    elif choice == '2':
        source_dir = input("Enter the source folder with Word files: ").strip()
        dest_dir = input("Enter the destination folder for cleaned files: ").strip()
        if not _prepare_directories(source_dir, dest_dir):
            return

        _process_tree(
            source_dir,
            dest_dir,
            ('.docx', '.docm'),
            remove_all_protection_tags,
            "Removing Word Protections",
            "clean",
        )

    else:
        logging.error("Invalid choice. Please restart the script and choose a valid option.")


if __name__ == "__main__":
    main()