"""PowerPoint to PDF converter using LibreOffice.""" import os import subprocess import time import logging from typing import Optional logger = logging.getLogger(__name__) def cleanup_libreoffice_processes() -> None: """Clean up any hanging LibreOffice processes.""" try: subprocess.run(['pkill', '-f', 'soffice'], capture_output=True, timeout=10) time.sleep(1) # Give processes time to terminate except Exception as e: logger.warning(f"Failed to cleanup LibreOffice processes: {e}") def pptx_to_pdf_libreoffice(pptx_path: str, output_dir: str) -> Optional[str]: """Convert PPTX to PDF using LibreOffice for highest quality. This function is the core component of the PPTX processing workflow: PPTX → PDF (this function) → JPG images (handled in uploads.py) Args: pptx_path: Path to the PPTX file output_dir: Directory to save the PDF Returns: Path to the generated PDF file, or None if conversion failed """ try: # Clean up any existing LibreOffice processes cleanup_libreoffice_processes() # Ensure output directory exists os.makedirs(output_dir, exist_ok=True) # Use LibreOffice to convert PPTX to PDF cmd = [ 'libreoffice', '--headless', '--convert-to', 'pdf', '--outdir', output_dir, '--invisible', '--nodefault', pptx_path ] logger.info(f"Converting PPTX to PDF using LibreOffice: {pptx_path}") # Increase timeout to 300 seconds (5 minutes) for large presentations result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) if result.returncode != 0: logger.error(f"LibreOffice conversion failed: {result.stderr}") logger.error(f"LibreOffice stdout: {result.stdout}") cleanup_libreoffice_processes() return None # Find the generated PDF file base_name = os.path.splitext(os.path.basename(pptx_path))[0] pdf_path = os.path.join(output_dir, f"{base_name}.pdf") if os.path.exists(pdf_path): logger.info(f"PDF conversion successful: {pdf_path}") cleanup_libreoffice_processes() return pdf_path else: logger.error(f"PDF file not found after conversion: {pdf_path}") cleanup_libreoffice_processes() return None except subprocess.TimeoutExpired: logger.error("LibreOffice conversion timed out (300s)") cleanup_libreoffice_processes() return None except Exception as e: logger.error(f"Error in PPTX to PDF conversion: {e}") cleanup_libreoffice_processes() return None def validate_pptx_file(filepath: str) -> bool: """Validate if file is a valid PowerPoint file. Args: filepath: Path to file to validate Returns: True if valid PPTX file, False otherwise """ if not os.path.exists(filepath): return False # Check file extension if not filepath.lower().endswith(('.ppt', '.pptx')): return False # Check file size (must be > 0) if os.path.getsize(filepath) == 0: return False return True