# Files
# digiserver/app/utils/pptx_converter.py
#
# 112 lines
# 3.9 KiB
# Python

"""
PPTX to PDF converter using LibreOffice for high-quality conversion
This module provides the essential function to convert PowerPoint presentations to PDF
using LibreOffice headless mode for professional-grade quality.
The converted PDF is then processed by the main upload workflow for 4K image generation.
"""
import os
import subprocess
import logging
import signal
import time
# Logger named after this module; used by every function below.
logger = logging.getLogger(__name__)
# Configure the root logger at INFO on import (basicConfig does nothing if
# the root logger already has handlers installed).
logging.basicConfig(level=logging.INFO)
def cleanup_libreoffice_processes():
    """Best-effort termination of lingering LibreOffice processes.

    Runs ``pkill -f soffice`` and, only when at least one process was
    actually signalled, waits one second so it has time to exit. Any failure
    is logged as a warning; nothing is raised to the caller.

    NOTE(review): ``-f`` matches the pattern against the full command line,
    so this also hits conversions started by other workers and any unrelated
    process whose command line contains "soffice" -- confirm that is
    acceptable for the deployment.
    """
    try:
        result = subprocess.run(
            ['pkill', '-f', 'soffice'], capture_output=True, timeout=10
        )
    except (OSError, subprocess.SubprocessError) as e:
        # pkill is missing / not executable, or it exceeded the 10 s timeout.
        logger.warning(f"Failed to cleanup LibreOffice processes: {e}")
        return

    if result.returncode == 0:
        # Something matched and was signalled: give it time to terminate.
        time.sleep(1)
    elif result.returncode != 1:
        # Exit status 1 only means "no process matched"; anything else is a
        # genuine pkill failure that used to be silently ignored.
        details = result.stderr.decode(errors="replace").strip()
        logger.warning(
            "Failed to cleanup LibreOffice processes: "
            f"pkill exited with status {result.returncode}: {details}"
        )
def _file_signature(path):
    """Return ``(mtime_ns, size)`` for *path*, or None if it cannot be stat'ed."""
    try:
        stat_result = os.stat(path)
    except OSError:
        return None
    return (stat_result.st_mtime_ns, stat_result.st_size)


def _convert_with_libreoffice(pptx_path, output_dir, timeout):
    """Run a single LibreOffice conversion; return the fresh PDF path or None.

    Lets subprocess.TimeoutExpired (and any other error) propagate so the
    public entry point can log it in one place.
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # The PDF is expected in output_dir under the input's base name.
    base_name = os.path.splitext(os.path.basename(pptx_path))[0]
    pdf_path = os.path.join(output_dir, f"{base_name}.pdf")
    # Remember what (if anything) is already there so a leftover PDF from an
    # earlier run is not mistaken for the result of this conversion.
    previous_signature = _file_signature(pdf_path)

    cmd = [
        'libreoffice',
        '--headless',
        '--convert-to', 'pdf',
        '--outdir', output_dir,
        '--invisible',  # Run without any UI
        '--nodefault',  # Don't start with default template
        # Absolute path: a relative name starting with '-' must not be
        # parsed as a command-line option.
        os.path.abspath(pptx_path),
    ]
    logger.info(f"Converting PPTX to PDF using LibreOffice: {pptx_path}")
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)

    if result.returncode != 0:
        logger.error(f"LibreOffice conversion failed: {result.stderr}")
        logger.error(f"LibreOffice stdout: {result.stdout}")
        return None

    current_signature = _file_signature(pdf_path)
    if current_signature is None:
        logger.error(f"PDF file not found after conversion: {pdf_path}")
        return None
    if current_signature == previous_signature:
        # Exit status was 0 but the pre-existing file was never rewritten.
        logger.error(
            f"LibreOffice did not update the existing PDF, ignoring stale file: {pdf_path}"
        )
        return None

    logger.info(f"PDF conversion successful: {pdf_path}")
    return pdf_path


def pptx_to_pdf_libreoffice(pptx_path, output_dir, timeout=300):
    """
    Convert PPTX to PDF using LibreOffice for highest quality.

    This function is the core component of the PPTX processing workflow:
    PPTX → PDF (this function) → 4K JPG images (handled in uploads.py)

    Lingering LibreOffice processes are cleaned up before the conversion and
    again afterwards, whatever the outcome. The function never raises: every
    failure is logged and reported as None.

    Args:
        pptx_path (str): Path to the PPTX file
        output_dir (str): Directory to save the PDF (created if missing)
        timeout (int | float): Seconds to wait for LibreOffice; defaults to
            300 (5 minutes) to allow for large presentations

    Returns:
        str: Path to the generated PDF file, or None if conversion failed
    """
    try:
        # Fail fast instead of launching LibreOffice on a missing input.
        if not os.path.isfile(pptx_path):
            logger.error(f"PPTX file not found: {pptx_path}")
            return None

        # Clean up any existing LibreOffice processes
        cleanup_libreoffice_processes()
        try:
            return _convert_with_libreoffice(pptx_path, output_dir, timeout)
        finally:
            # Single cleanup point for success, failure, timeout and error.
            cleanup_libreoffice_processes()
    except subprocess.TimeoutExpired:
        logger.error(f"LibreOffice conversion timed out ({timeout}s)")
        return None
    except Exception as e:
        # Top-level boundary: callers rely on None rather than an exception.
        logger.error(f"Error in PPTX to PDF conversion: {e}")
        import traceback
        logger.error(f"Traceback: {traceback.format_exc()}")
        return None
def main(argv=None):
    """Command-line smoke test: convert one PPTX into ``test_output``.

    Args:
        argv (list[str] | None): Argument vector including the program name
            at index 0; defaults to ``sys.argv``.

    Returns:
        int: Process exit status -- 0 on success, 1 if the input file does
        not exist or the conversion failed, 2 if no input file was given.
    """
    if argv is None:
        import sys
        argv = sys.argv

    if len(argv) <= 1:
        print("Usage: python pptx_converter.py <pptx_file>")
        return 2

    test_pptx = argv[1]
    if not os.path.exists(test_pptx):
        print(f"File not found: {test_pptx}")
        return 1

    output_dir = "test_output"
    pdf_result = pptx_to_pdf_libreoffice(test_pptx, output_dir)
    if not pdf_result:
        print("PPTX to PDF conversion failed")
        return 1

    print(f"Successfully converted PPTX to PDF: {pdf_result}")
    return 0


if __name__ == "__main__":
    # Test the converter; propagate the outcome as the process exit status
    # so shell callers can detect a failed conversion.
    raise SystemExit(main())