diff --git a/combine.py b/combine.py index e96869e..3575368 100644 --- a/combine.py +++ b/combine.py @@ -1,159 +1,246 @@ import os import datetime -from typing import List, Optional +import logging +import re +import tempfile +import zipfile +from concurrent.futures import ThreadPoolExecutor +from typing import List +import threading + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') class FileCombiner: + """Combines multiple files into a single output file, with various options.""" + def __init__(self): - self.extensions = ['.php'] - self.output_file = "combined_files.txt" self.source_dir = "." + self.output_file = "combined_files.txt" + self.extensions = [] # Empty list means all extensions are included + self.exclude_folders = ['.git'] + self.exclude_patterns = [] self.include_line_numbers = False self.include_timestamp = False - self.add_syntax_highlight = False - self.exclude_folders: List[str] = [] self.include_file_size = False + self.add_syntax_highlight = False # Requires manual language specification in output self.max_file_size_mb = None - + self.create_zip_archive = False + self.exclude_images = False # Basic image extension check, not fully reliable + self.exclude_executable = False + self.exclude_temp_and_backup_files = False + self.exclude_hidden_files = False + self.num_worker_threads = 4 + self.lock = threading.Lock() + + def get_user_preferences(self): - """Interactive menu to get user preferences""" + """Interactively gets user preferences for file combination.""" + print("\n=== File Combiner Configuration ===") - - # Source directory - print("\n1. Select source directory:") - user_dir = input(f"Enter directory path (press Enter for current directory): ").strip() - if user_dir and os.path.isdir(user_dir): - self.source_dir = user_dir - - # File extensions - print("\n2. Select file extensions to combine:") - print("Common options: 1) PHP files 2) Python files 3) All files 4) Custom") - ext_choice = input("Enter your choice (1-4): ").strip() - if ext_choice == "1": - self.extensions = ['.php'] - elif ext_choice == "2": - self.extensions = ['.py'] - elif ext_choice == "3": - self.extensions = [] # Empty list means all files - elif ext_choice == "4": - exts = input("Enter extensions separated by comma (e.g., .php,.py,.js): ").strip() - self.extensions = [ext.strip() for ext in exts.split(',') if ext.strip()] - - # Output file - print("\n3. Configure output:") - output_name = input("Enter output filename (press Enter for 'combined_files.txt'): ").strip() - if output_name: - self.output_file = output_name if output_name.endswith('.txt') else output_name + '.txt' - - # Additional options - print("\n4. Additional options (enter y/n for each):") - self.include_line_numbers = input("Include line numbers? ").lower().startswith('y') - self.include_timestamp = input("Include file timestamps? ").lower().startswith('y') - self.include_file_size = input("Include file sizes? ").lower().startswith('y') - self.add_syntax_highlight = input("Add syntax highlighting markers? ").lower().startswith('y') - - # Exclusions - print("\n5. Exclusion options:") - if input("Do you want to exclude specific folders? (y/n) ").lower().startswith('y'): - folders = input("Enter folder names to exclude (comma-separated): ").strip() - self.exclude_folders = [f.strip() for f in folders.split(',') if f.strip()] - - # File size limit - if input("\nDo you want to set a maximum file size limit? (y/n) ").lower().startswith('y'): + + self.source_dir = self._get_input("Source directory (default: .): ", self.source_dir, os.path.isdir) + self.output_file = self._get_input("Output file name (default: combined_files.txt): ", self.output_file) + self.extensions = self._get_list_input("File extensions to include (comma-separated, or Enter for all): ") + self.exclude_folders = self._get_list_input("Folders to exclude (comma-separated, default: .git): ", self.exclude_folders) + self.exclude_patterns = self._get_list_input("Regex patterns to exclude (comma-separated): ") + + self.include_line_numbers = self._get_boolean_input("Include line numbers? (y/n): ") + self.include_timestamp = self._get_boolean_input("Include timestamps? (y/n): ") + self.include_file_size = self._get_boolean_input("Include file sizes? (y/n): ") + self.add_syntax_highlight = self._get_boolean_input("Add syntax highlighting (requires manual language spec)? (y/n): ") # Clarify manual aspect + self.max_file_size_mb = self._get_float_input("Max file size to include (MB, or Enter for no limit): ") + self.create_zip_archive = self._get_boolean_input("Create zip archive of output? (y/n): ") + self.exclude_images = self._get_boolean_input("Exclude common image files (basic check, not fully reliable)? (y/n): ") + self.exclude_executable = self._get_boolean_input("Exclude executable files? (y/n): ") + self.exclude_temp_and_backup_files = self._get_boolean_input("Exclude temp/backup files? (y/n): ") + self.exclude_hidden_files = self._get_boolean_input("Exclude hidden files? (y/n): ") + self.num_worker_threads = self._get_int_input("Number of worker threads (default: 4): ", 4, lambda x: x > 0) + + + def _get_input(self, prompt: str, default: str = None, validator=lambda x: True) -> str: + while True: + value = input(prompt).strip() + if not value: + return default + if validator(value): + return value + print("Invalid input.") + + + def _get_list_input(self, prompt: str, default: list = None) -> list: + value = input(prompt).strip() + if not value: + return default or [] + return [x.strip() for x in value.split(',') if x.strip()] + + + def _get_boolean_input(self, prompt: str) -> bool: + return self._get_input(prompt + " (y/n): ", "n", lambda x: x.lower() in ('y', 'n')) == 'y' + + + def _get_float_input(self, prompt: str) -> float or None: # returns float or None for no input + while True: + value = input(prompt).strip() + if not value: + return None # Allow no input, return None + try: + return float(value) + except ValueError: + print("Invalid input. Please enter a number.") + + + def _get_int_input(self, prompt: str, default: int, validator=lambda x: True) -> int: + while True: + value = input(prompt).strip() + if not value: + return default try: - self.max_file_size_mb = float(input("Enter maximum file size in MB: ")) + int_value = int(value) + if validator(int_value): + return int_value + else: + print("Invalid input. Value does not meet criteria.") except ValueError: - print("Invalid input. No file size limit will be applied.") + print("Invalid input. Please enter an integer.") + def should_process_file(self, filepath: str) -> bool: - """Check if file should be processed based on settings""" - # Check extensions + """Determines whether a file should be included in the combination based on user settings.""" + if self.extensions and not any(filepath.endswith(ext) for ext in self.extensions): return False - - # Check excluded folders if any(folder in filepath for folder in self.exclude_folders): return False - - # Check file size - if self.max_file_size_mb: - size_mb = os.path.getsize(filepath) / (1024 * 1024) - if size_mb > self.max_file_size_mb: + if self.exclude_patterns and any(re.match(pattern, filepath) for pattern in self.exclude_patterns): + return False + try: + file_size = os.path.getsize(filepath) + if self.max_file_size_mb and file_size > self.max_file_size_mb * 1024 * 1024: return False - + except OSError: + logging.warning(f"Could not get size of {filepath}. Skipping.") + return False + if self.exclude_images and any(filepath.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']): + return False # Basic image file extension check. + if self.exclude_executable and os.access(filepath, os.X_OK): + return False + if self.exclude_temp_and_backup_files and (filepath.startswith(tempfile.gettempdir()) or any(filepath.endswith(ext) for ext in ['.tmp', '.temp', '.bak', '~'])): + return False + if self.exclude_hidden_files and os.path.basename(filepath).startswith('.'): + return False + return True + def _process_file(self, filepath: str, outfile): + """Processes and writes a single file to the output.""" + try: + with open(filepath, 'r', encoding='utf-8') as infile: + content = infile.read() + with self.lock: # Lock to prevent race conditions with multithreading + self._write_file_header(outfile, filepath) + if self.add_syntax_highlight: + ext = os.path.splitext(filepath)[1] + outfile.write(f"```{ext[1:] if ext else ''}\n") # Manual language specification + if self.include_line_numbers: + for i, line in enumerate(content.splitlines(), 1): + outfile.write(f"{i:4d} | {line}\n") + else: + outfile.write(content) + if self.add_syntax_highlight: + outfile.write("```\n") + outfile.write("\n") + + return 1, os.path.getsize(filepath) # Return file count and size + except Exception as e: + logging.error(f"Error reading {filepath}: {e}") + with self.lock: + outfile.write(f"Error reading {filepath}: {e}\n\n") + return 0, 0 # File not read, size is 0. + + def _write_summary(self, outfile): + """Writes the initial summary information to the output file.""" + outfile.write("=== File Combination Summary ===\n") + outfile.write(f"Generated on: {datetime.datetime.now()}\n") + outfile.write(f"Source directory: {os.path.abspath(self.source_dir)}\n") + + outfile.write(f"Included extensions: {', '.join(self.extensions) if self.extensions else 'All'}\n") + outfile.write(f"Excluded folders: {', '.join(self.exclude_folders)}\n") + + outfile.write("=" * 80 + "\n\n") + + def _write_file_header(self, outfile, filepath): + outfile.write("=" * 80 + "\n") + outfile.write(f"File: {os.path.relpath(filepath, self.source_dir)}\n") # Use relative path + if self.include_timestamp: + timestamp = datetime.datetime.fromtimestamp(os.path.getmtime(filepath)) + outfile.write(f"Last Modified: {timestamp}\n") + + if self.include_file_size: + size = os.path.getsize(filepath) + outfile.write(f"Size: {size / 1024:.2f} KB\n") + outfile.write("=" * 80 + "\n\n") + + + def _write_combination_summary(self, outfile, files_processed, total_size): + """Writes the final combination summary to the output file.""" + + outfile.write("=" * 80 + "\n") + outfile.write(f"Total files processed: {files_processed}\n") + outfile.write(f"Total size: {total_size / 1024 / 1024:.2f} MB\n") + + def combine_files(self): - """Combine files according to user preferences""" + """Combines the files according to the user preferences.""" + + try: with open(self.output_file, 'w', encoding='utf-8') as outfile: - # Write configuration summary - outfile.write("=== File Combination Summary ===\n") - outfile.write(f"Generated on: {datetime.datetime.now()}\n") - outfile.write(f"Source directory: {os.path.abspath(self.source_dir)}\n") - outfile.write(f"File types: {', '.join(self.extensions) if self.extensions else 'All files'}\n") - outfile.write("=" * 80 + "\n\n") - - files_processed = 0 - total_size = 0 - - for dirpath, dirnames, filenames in os.walk(self.source_dir): - # Remove excluded folders - dirnames[:] = [d for d in dirnames if d not in self.exclude_folders] - + self._write_summary(outfile) + + + file_paths = [] + for dirpath, dirnames, filenames in os.walk(self.source_dir, followlinks=False): # followlinks=False added + dirnames[:] = [d for d in dirnames if d not in self.exclude_folders] # Exclude specified directories for filename in filenames: filepath = os.path.join(dirpath, filename) - - if not self.should_process_file(filepath): - continue - - # Write file header - outfile.write("\n" + "=" * 80 + "\n") - outfile.write(f"File: {filename}\n") - outfile.write(f"Location: {os.path.relpath(filepath, self.source_dir)}\n") - - if self.include_timestamp: - timestamp = datetime.datetime.fromtimestamp(os.path.getmtime(filepath)) - outfile.write(f"Last modified: {timestamp}\n") - - if self.include_file_size: - size = os.path.getsize(filepath) - outfile.write(f"Size: {size/1024:.2f} KB\n") - total_size += size - - outfile.write("=" * 80 + "\n\n") - - # Write file content - try: - with open(filepath, 'r', encoding='utf-8') as infile: - if self.add_syntax_highlight: - ext = os.path.splitext(filename)[1] - outfile.write(f"```{ext[1:] if ext else ''}\n") - - if self.include_line_numbers: - for i, line in enumerate(infile, 1): - outfile.write(f"{i:4d} | {line}") - else: - outfile.write(infile.read()) - - if self.add_syntax_highlight: - outfile.write("\n```\n") - - outfile.write("\n") - files_processed += 1 - - except Exception as e: - outfile.write(f"Error reading file: {str(e)}\n") - - # Write summary at the end - outfile.write("\n" + "=" * 80 + "\n") - outfile.write(f"Total files processed: {files_processed}\n") - outfile.write(f"Total size: {total_size/1024/1024:.2f} MB\n") - - print(f"\nSuccessfully combined {files_processed} files into {self.output_file}") - print(f"Total size: {total_size/1024/1024:.2f} MB") - + if self.should_process_file(filepath): + file_paths.append(filepath) + + with ThreadPoolExecutor(max_workers=self.num_worker_threads) as executor: + results = executor.map(self._process_file, file_paths, [outfile] * len(file_paths)) + files_processed, total_size = sum([r[0] for r in results]), sum([r[1] for r in results]) + + self._write_combination_summary(outfile, files_processed, total_size) + + if self.create_zip_archive: + self._create_zip_archive() + + logging.info(f"Combined {files_processed} files into {self.output_file}") + logging.info(f"Total size: {total_size / 1024 / 1024:.2f} MB") + + print(f"\nCombined {files_processed} files into {self.output_file}") + print(f"Total size: {total_size / 1024 / 1024:.2f} MB") + except Exception as e: - print(f"An error occurred: {str(e)}") + logging.error(f"An error occurred: {e}") + print(f"An error occurred: {e}") + + + + def _create_zip_archive(self): + zip_filename = self.output_file.replace('.txt', '.zip') + try: + with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + zipf.write(self.output_file, arcname=os.path.basename(self.output_file)) # Use arcname for correct filename in zip + logging.info(f"Created zip archive: {zip_filename}") + print(f"Created zip archive: {zip_filename}") + + except Exception as e: + logging.error(f"Error creating zip archive: {e}") + print(f"Error creating zip archive: {e}") + + def main(): combiner = FileCombiner() @@ -161,4 +248,4 @@ def main(): combiner.combine_files() if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/combined_files.txt b/combined_files.txt new file mode 100644 index 0000000..a4d6252 --- /dev/null +++ b/combined_files.txt @@ -0,0 +1,670 @@ +=== File Combination Summary === +Generated on: 2024-10-30 10:26:39.975657 +Source directory: C:\Users\g1n\Documents\Github\textcodecombiner +Included extensions: All +Excluded folders: .git +================================================================================ + +================================================================================ +File: combine.py +================================================================================ + +import os +import datetime +import logging +import re +import tempfile +import zipfile +from concurrent.futures import ThreadPoolExecutor +from typing import List +import threading + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +class FileCombiner: + """Combines multiple files into a single output file, with various options.""" + + def __init__(self): + self.source_dir = "." + self.output_file = "combined_files.txt" + self.extensions = [] # Empty list means all extensions are included + self.exclude_folders = ['.git'] + self.exclude_patterns = [] + self.include_line_numbers = False + self.include_timestamp = False + self.include_file_size = False + self.add_syntax_highlight = False # Requires manual language specification in output + self.max_file_size_mb = None + self.create_zip_archive = False + self.exclude_images = False # Basic image extension check, not fully reliable + self.exclude_executable = False + self.exclude_temp_and_backup_files = False + self.exclude_hidden_files = False + self.num_worker_threads = 4 + self.lock = threading.Lock() + + + def get_user_preferences(self): + """Interactively gets user preferences for file combination.""" + + print("\n=== File Combiner Configuration ===") + + self.source_dir = self._get_input("Source directory (default: .): ", self.source_dir, os.path.isdir) + self.output_file = self._get_input("Output file name (default: combined_files.txt): ", self.output_file) + self.extensions = self._get_list_input("File extensions to include (comma-separated, or Enter for all): ") + self.exclude_folders = self._get_list_input("Folders to exclude (comma-separated, default: .git): ", self.exclude_folders) + self.exclude_patterns = self._get_list_input("Regex patterns to exclude (comma-separated): ") + + self.include_line_numbers = self._get_boolean_input("Include line numbers? (y/n): ") + self.include_timestamp = self._get_boolean_input("Include timestamps? (y/n): ") + self.include_file_size = self._get_boolean_input("Include file sizes? (y/n): ") + self.add_syntax_highlight = self._get_boolean_input("Add syntax highlighting (requires manual language spec)? (y/n): ") # Clarify manual aspect + self.max_file_size_mb = self._get_float_input("Max file size to include (MB, or Enter for no limit): ") + self.create_zip_archive = self._get_boolean_input("Create zip archive of output? (y/n): ") + self.exclude_images = self._get_boolean_input("Exclude common image files (basic check, not fully reliable)? (y/n): ") + self.exclude_executable = self._get_boolean_input("Exclude executable files? (y/n): ") + self.exclude_temp_and_backup_files = self._get_boolean_input("Exclude temp/backup files? (y/n): ") + self.exclude_hidden_files = self._get_boolean_input("Exclude hidden files? (y/n): ") + self.num_worker_threads = self._get_int_input("Number of worker threads (default: 4): ", 4, lambda x: x > 0) + + + def _get_input(self, prompt: str, default: str = None, validator=lambda x: True) -> str: + while True: + value = input(prompt).strip() + if not value: + return default + if validator(value): + return value + print("Invalid input.") + + + def _get_list_input(self, prompt: str, default: list = None) -> list: + value = input(prompt).strip() + if not value: + return default or [] + return [x.strip() for x in value.split(',') if x.strip()] + + + def _get_boolean_input(self, prompt: str) -> bool: + return self._get_input(prompt + " (y/n): ", "n", lambda x: x.lower() in ('y', 'n')) == 'y' + + + def _get_float_input(self, prompt: str) -> float or None: # returns float or None for no input + while True: + value = input(prompt).strip() + if not value: + return None # Allow no input, return None + try: + return float(value) + except ValueError: + print("Invalid input. Please enter a number.") + + + def _get_int_input(self, prompt: str, default: int, validator=lambda x: True) -> int: + while True: + value = input(prompt).strip() + if not value: + return default + try: + int_value = int(value) + if validator(int_value): + return int_value + else: + print("Invalid input. Value does not meet criteria.") + except ValueError: + print("Invalid input. Please enter an integer.") + + + def should_process_file(self, filepath: str) -> bool: + """Determines whether a file should be included in the combination based on user settings.""" + + if self.extensions and not any(filepath.endswith(ext) for ext in self.extensions): + return False + if any(folder in filepath for folder in self.exclude_folders): + return False + if self.exclude_patterns and any(re.match(pattern, filepath) for pattern in self.exclude_patterns): + return False + try: + file_size = os.path.getsize(filepath) + if self.max_file_size_mb and file_size > self.max_file_size_mb * 1024 * 1024: + return False + except OSError: + logging.warning(f"Could not get size of {filepath}. Skipping.") + return False + if self.exclude_images and any(filepath.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']): + return False # Basic image file extension check. + if self.exclude_executable and os.access(filepath, os.X_OK): + return False + if self.exclude_temp_and_backup_files and (filepath.startswith(tempfile.gettempdir()) or any(filepath.endswith(ext) for ext in ['.tmp', '.temp', '.bak', '~'])): + return False + if self.exclude_hidden_files and os.path.basename(filepath).startswith('.'): + return False + + return True + + def _process_file(self, filepath: str, outfile): + """Processes and writes a single file to the output.""" + try: + with open(filepath, 'r', encoding='utf-8') as infile: + content = infile.read() + with self.lock: # Lock to prevent race conditions with multithreading + self._write_file_header(outfile, filepath) + if self.add_syntax_highlight: + ext = os.path.splitext(filepath)[1] + outfile.write(f"```{ext[1:] if ext else ''}\n") # Manual language specification + if self.include_line_numbers: + for i, line in enumerate(content.splitlines(), 1): + outfile.write(f"{i:4d} | {line}\n") + else: + outfile.write(content) + if self.add_syntax_highlight: + outfile.write("```\n") + outfile.write("\n") + + return 1, os.path.getsize(filepath) # Return file count and size + except Exception as e: + logging.error(f"Error reading {filepath}: {e}") + with self.lock: + outfile.write(f"Error reading {filepath}: {e}\n\n") + return 0, 0 # File not read, size is 0. + + def _write_summary(self, outfile): + """Writes the initial summary information to the output file.""" + outfile.write("=== File Combination Summary ===\n") + outfile.write(f"Generated on: {datetime.datetime.now()}\n") + outfile.write(f"Source directory: {os.path.abspath(self.source_dir)}\n") + + outfile.write(f"Included extensions: {', '.join(self.extensions) if self.extensions else 'All'}\n") + outfile.write(f"Excluded folders: {', '.join(self.exclude_folders)}\n") + + outfile.write("=" * 80 + "\n\n") + + def _write_file_header(self, outfile, filepath): + outfile.write("=" * 80 + "\n") + outfile.write(f"File: {os.path.relpath(filepath, self.source_dir)}\n") # Use relative path + if self.include_timestamp: + timestamp = datetime.datetime.fromtimestamp(os.path.getmtime(filepath)) + outfile.write(f"Last Modified: {timestamp}\n") + + if self.include_file_size: + size = os.path.getsize(filepath) + outfile.write(f"Size: {size / 1024:.2f} KB\n") + outfile.write("=" * 80 + "\n\n") + + + def _write_combination_summary(self, outfile, files_processed, total_size): + """Writes the final combination summary to the output file.""" + + outfile.write("=" * 80 + "\n") + outfile.write(f"Total files processed: {files_processed}\n") + outfile.write(f"Total size: {total_size / 1024 / 1024:.2f} MB\n") + + + def combine_files(self): + """Combines the files according to the user preferences.""" + + + try: + with open(self.output_file, 'w', encoding='utf-8') as outfile: + self._write_summary(outfile) + + + file_paths = [] + for dirpath, dirnames, filenames in os.walk(self.source_dir, followlinks=False): # followlinks=False added + dirnames[:] = [d for d in dirnames if d not in self.exclude_folders] # Exclude specified directories + for filename in filenames: + filepath = os.path.join(dirpath, filename) + if self.should_process_file(filepath): + file_paths.append(filepath) + + with ThreadPoolExecutor(max_workers=self.num_worker_threads) as executor: + results = executor.map(self._process_file, file_paths, [outfile] * len(file_paths)) + files_processed, total_size = sum([r[0] for r in results]), sum([r[1] for r in results]) + + self._write_combination_summary(outfile, files_processed, total_size) + + if self.create_zip_archive: + self._create_zip_archive() + + logging.info(f"Combined {files_processed} files into {self.output_file}") + logging.info(f"Total size: {total_size / 1024 / 1024:.2f} MB") + + print(f"\nCombined {files_processed} files into {self.output_file}") + print(f"Total size: {total_size / 1024 / 1024:.2f} MB") + + except Exception as e: + logging.error(f"An error occurred: {e}") + print(f"An error occurred: {e}") + + + + def _create_zip_archive(self): + zip_filename = self.output_file.replace('.txt', '.zip') + try: + with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + zipf.write(self.output_file, arcname=os.path.basename(self.output_file)) # Use arcname for correct filename in zip + logging.info(f"Created zip archive: {zip_filename}") + print(f"Created zip archive: {zip_filename}") + + except Exception as e: + logging.error(f"Error creating zip archive: {e}") + print(f"Error creating zip archive: {e}") + + + +def main(): + combiner = FileCombiner() + combiner.get_user_preferences() + combiner.combine_files() + +if __name__ == "__main__": + main() +================================================================================ +File: combined_files.txt +================================================================================ + +=== File Combination Summary === +Generated on: 2024-10-30 10:26:39.975657 +Source directory: C:\Users\g1n\Documents\Github\textcodecombiner +Included extensions: All +Excluded folders: .git +================================================================================ + +================================================================================ +File: combine.py +================================================================================ + +import os +import datetime +import logging +import re +import tempfile +import zipfile +from concurrent.futures import ThreadPoolExecutor +from typing import List +import threading + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +class FileCombiner: + """Combines multiple files into a single output file, with various options.""" + + def __init__(self): + self.source_dir = "." + self.output_file = "combined_files.txt" + self.extensions = [] # Empty list means all extensions are included + self.exclude_folders = ['.git'] + self.exclude_patterns = [] + self.include_line_numbers = False + self.include_timestamp = False + self.include_file_size = False + self.add_syntax_highlight = False # Requires manual language specification in output + self.max_file_size_mb = None + self.create_zip_archive = False + self.exclude_images = False # Basic image extension check, not fully reliable + self.exclude_executable = False + self.exclude_temp_and_backup_files = False + self.exclude_hidden_files = False + self.num_worker_threads = 4 + self.lock = threading.Lock() + + + def get_user_preferences(self): + """Interactively gets user preferences for file combination.""" + + print("\n=== File Combiner Configuration ===") + + self.source_dir = self._get_input("Source directory (default: .): ", self.source_dir, os.path.isdir) + self.output_file = self._get_input("Output file name (default: combined_files.txt): ", self.output_file) + self.extensions = self._get_list_input("File extensions to include (comma-separated, or Enter for all): ") + self.exclude_folders = self._get_list_input("Folders to exclude (comma-separated, default: .git): ", self.exclude_folders) + self.exclude_patterns = self._get_list_input("Regex patterns to exclude (comma-separated): ") + + self.include_line_numbers = self._get_boolean_input("Include line numbers? (y/n): ") + self.include_timestamp = self._get_boolean_input("Include timestamps? (y/n): ") + self.include_file_size = self._get_boolean_input("Include file sizes? (y/n): ") + self.add_syntax_highlight = self._get_boolean_input("Add syntax highlighting (requires manual language spec)? (y/n): ") # Clarify manual aspect + self.max_file_size_mb = self._get_float_input("Max file size to include (MB, or Enter for no limit): ") + self.create_zip_archive = self._get_boolean_input("Create zip archive of output? (y/n): ") + self.exclude_images = self._get_boolean_input("Exclude common image files (basic check, not fully reliable)? (y/n): ") + self.exclude_executable = self._get_boolean_input("Exclude executable files? (y/n): ") + self.exclude_temp_and_backup_files = self._get_boolean_input("Exclude temp/backup files? (y/n): ") + self.exclude_hidden_files = self._get_boolean_input("Exclude hidden files? (y/n): ") + self.num_worker_threads = self._get_int_input("Number of worker threads (default: 4): ", 4, lambda x: x > 0) + + + def _get_input(self, prompt: str, default: str = None, validator=lambda x: True) -> str: + while True: + value = input(prompt).strip() + if not value: + return default + if validator(value): + return value + print("Invalid input.") + + + def _get_list_input(self, prompt: str, default: list = None) -> list: + value = input(prompt).strip() + if not value: + return default or [] + return [x.strip() for x in value.split(',') if x.strip()] + + + def _get_boolean_input(self, prompt: str) -> bool: + return self._get_input(prompt + " (y/n): ", "n", lambda x: x.lower() in ('y', 'n')) == 'y' + + + def _get_float_input(self, prompt: str) -> float or None: # returns float or None for no input + while True: + value = input(prompt).strip() + if not value: + return None # Allow no input, return None + try: + return float(value) + except ValueError: + print("Invalid input. Please enter a number.") + + + def _get_int_input(self, prompt: str, default: int, validator=lambda x: True) -> int: + while True: + value = input(prompt).strip() + if not value: + return default + try: + int_value = int(value) + if validator(int_value): + return int_value + else: + print("Invalid input. Value does not meet criteria.") + except ValueError: + print("Invalid input. Please enter an integer.") + + + def should_process_file(self, filepath: str) -> bool: + """Determines whether a file should be included in the combination based on user settings.""" + + if self.extensions and not any(filepath.endswith(ext) for ext in self.extensions): + return False + if any(folder in filepath for folder in self.exclude_folders): + return False + if self.exclude_patterns and any(re.match(pattern, filepath) for pattern in self.exclude_patterns): + return False + try: + file_size = os.path.getsize(filepath) + if self.max_file_size_mb and file_size > self.max_file_size_mb * 1024 * 1024: + return False + except OSError: + logging.warning(f"Could not get size of {filepath}. Skipping.") + return False + if self.exclude_images and any(filepath.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']): + return False # Basic image file extension check. + if self.exclude_executable and os.access(filepath, os.X_OK): + return False + if self.exclude_temp_and_backup_files and (filepath.startswith(tempfile.gettempdir()) or any(filepath.endswith(ext) for ext in ['.tmp', '.temp', '.bak', '~'])): + return False + if self.exclude_hidden_files and os.path.basename(filepath).startswith('.'): + return False + + return True + + def _process_file(self, filepath: str, outfile): + """Processes and writes a single file to the output.""" + try: + with open(filepath, 'r', encoding='utf-8') as infile: + content = infile.read() + with self.lock: # Lock to prevent race conditions with multithreading + self._write_file_header(outfile, filepath) + if self.add_syntax_highlight: + ext = os.path.splitext(filepath)[1] + outfile.write(f"```{ext[1:] if ext else ''}\n") # Manual language specification + if self.include_line_numbers: + for i, line in enumerate(content.splitlines(), 1): + outfile.write(f"{i:4d} | {line}\n") + else: + outfile.write(content) + if self.add_syntax_highlight: + outfile.write("```\n") + outfile.write("\n") + + return 1, os.path.getsize(filepath) # Return file count and size + except Exception as e: + logging.error(f"Error reading {filepath}: {e}") + with self.lock: + outfile.write(f"Error reading {filepath}: {e}\n\n") + return 0, 0 # File not read, size is 0. + + def _write_summary(self, outfile): + """Writes the initial summary information to the output file.""" + outfile.write("=== File Combination Summary ===\n") + outfile.write(f"Generated on: {datetime.datetime.now()}\n") + outfile.write(f"Source directory: {os.path.abspath(self.source_dir)}\n") + + outfile.write(f"Included extensions: {', '.join(self.extensions) if self.extensions else 'All'}\n") + outfile.write(f"Excluded folders: {', '.join(self.exclude_folders)}\n") + + outfile.write("=" * 80 + "\n\n") + + def _write_file_header(self, outfile, filepath): + outfile.write("=" * 80 + "\n") + outfile.write(f"File: {os.path.relpath(filepath, self.source_dir)}\n") # Use relative path + if self.include_timestamp: + timestamp = datetime.datetime.fromtimestamp(os.path.getmtime(filepath)) + outfile.write(f"Last Modified: {timestamp}\n") + + if self.include_file_size: + size = os.path.getsize(filepath) + outfile.write(f"Size: {size / 1024:.2f} KB\n") + outfile.write("=" * 80 + "\n\n") + + + def _write_combination_summary(self, outfile, files_processed, total_size): + """Writes the final combination summary to the output file.""" + + outfile.write("=" * 80 + "\n") + outfile.write(f"Total files processed: {files_processed}\n") + outfile.write(f"Total size: {total_size / 1024 / 1024:.2f} MB\n") + + + def combine_files(self): + """Combines the files according to the user preferences.""" + + + try: + with open(self.output_file, 'w', encoding='utf-8') as outfile: + self._write_summary(outfile) + + + file_paths = [] + for dirpath, dirnames, filenames in os.walk(self.source_dir, followlinks=False): # followlinks=False added + dirnames[:] = [d for d in dirnames if d not in self.exclude_folders] # Exclude specified directories + for filename in filenames: + filepath = os.path.join(dirpath, filename) + if self.should_process_file(filepath): + file_paths.append(filepath) + + with ThreadPoolExecutor(max_workers=self.num_worker_threads) as executor: + results = executor.map(self._process_file, file_paths, [outfile] * len(file_paths)) + files_processed, total_size = sum([r[0] for r in results]), sum([r[1] for r in results]) + + self._write_combination_summary(outfile, files_processed, total_size) + + if self.create_zip_archive: + self._create_zip_archive() + + logging.info(f"Combined {files_processed} files into {self.output_file}") + logging.info(f"Total size: {total_size / 1024 / 1024:.2f} MB") + + print(f"\nCombined {files_processed} files into {self.output_file}") + print(f"Total size: {total_size / 1024 / 1024:.2f} MB") + + except Exception as e: + logging.error(f"An error occurred: {e}") + print(f"An error occurred: {e}") + + + + def _create_zip_archive(self): + zip_filename = self.output_file.replace('.txt', '.zip') + try: + with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + zipf.write(self.output_file, arcname=os.path.basename(self.output_file)) # Use arcname for correct filename in zip + logging.info(f"Created zip archive: {zip_filename}") + print(f"Created zip archive: {zip_filename}") + + except Exception as e: + logging.error(f"Error creating zip archive: {e}") + print(f"Error creating zip archive: {e}") + + + +def main(): + combiner = FileCombiner() + combiner.get_user_preferences() + combiner.combine_files() + +if __name__ == "__main__": + main() +================================================================================ +File: README.md +================================================================================ + +# 📑 Text-Code Combiner + +[![Python](https://img.shields.io/badge/Python-3.6%2B-blue)](https://www.python.org/downloads/) +[![License](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT) +[![Version](https://img.shields.io/badge/version-1.0.0-blue.svg)](https://github.com/yourusername/text-code-combiner/releases) +[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/yourusername/text-code-combiner/pulls) + +A powerful Python utility that intelligently combines source code and text files into a single well-organized document. Perfect for code documentation, project analysis, and source code archiving. + +## ✨ Key Features + +### 🎯 Smart File Handling +- 📁 Recursive directory scanning +- 🔍 Multiple file format support +- 🎨 Automatic syntax highlighting +- 📊 File statistics and metrics + +### 📋 Document Organization +- 🗂️ Hierarchical file structure preservation +- 📝 Automatic content formatting +- 🏷️ Intelligent file categorization +- 📑 Table of contents generation + +### ⚙️ Customization Options +- 🎛️ Configurable output formats +- 🎯 Custom file filtering +- 📏 Size limit controls +- 🚫 Directory exclusion + +## 🚀 Quick Start + +### Prerequisites +- Python 3.6 or higher +- Any operating system (Windows/Linux/MacOS) + +### Installation + +```bash +# Clone the repository +git clone https://github.com/yourusername/text-code-combiner.git + +# Navigate to project directory +cd text-code-combiner + +# Run the combiner +python combine.py +``` + +## 💡 Usage Examples + +### Basic Usage +```bash +python combine.py +``` + +### Sample Output Structure +```plaintext +=== Document Summary === +📅 Generated: 2024-10-27 14:30:45 +📁 Source: /path/to/project +📊 Files Processed: 15 +📏 Total Size: 1.25 MB + +===================================== +📄 File: app.php +📍 Location: src/app.php +⏱️ Modified: 2024-10-27 14:25:30 +📊 Size: 1.25 KB +===================================== + +[Content with syntax highlighting] +``` + +## ⚙️ Configuration Guide + +### 1. Source Selection 📁 +- Choose target directory +- Set file type filters +- Configure depth level + +### 2. Output Customization 🎨 +- Format selection +- Metadata options +- Syntax highlighting preferences + +### 3. Processing Rules 🔧 +- Size limitations +- Exclusion patterns +- Content filters + +## 🎯 Use Cases + +- 📚 Code Documentation + - Combine related source files + - Generate readable documentation + - Create searchable code archives + +- 🔍 Code Review + - Analyze entire codebases + - Track changes across files + - Review code structure + +- 📊 Project Analysis + - Generate project reports + - Analyze code distribution + - Track project metrics + +## 🛠️ Technical Details + +### System Requirements +- 💻 CPU: Any modern processor +- 🧮 RAM: 512MB minimum +- 💾 Storage: 10MB free space +- 🐍 Python 3.6+ + +### Supported File Types +- 💻 Programming: `.php`, `.py`, `.js`, `.java`, etc. +- 📝 Text: `.txt`, `.md`, `.csv` +- 📄 Documents: `.xml`, `.json`, `.yaml` +- ⚙️ Config: `.ini`, `.conf`, `.env` + +## 🤝 Contributing + +We welcome contributions! Here's how you can help: + +1. 🍴 Fork the repository +2. 🌿 Create your feature branch (`git checkout -b feature/AmazingFeature`) +3. 💾 Commit changes (`git commit -m 'Add AmazingFeature'`) +4. 📤 Push to branch (`git push origin feature/AmazingFeature`) +5. 🔄 Open a Pull Request + +## 📝 License + +Released under MIT License. See [LICENSE](LICENSE) for details. + +================================================================================ +Total files processed: 3 +Total size: 0.00 MB diff --git a/combinefiles-x.py b/combinefiles-x.py new file mode 100644 index 0000000..29f4503 --- /dev/null +++ b/combinefiles-x.py @@ -0,0 +1,247 @@ +import os +import datetime +import logging +import re +import tempfile +import zipfile +from concurrent.futures import ThreadPoolExecutor +from typing import List, Callable, Optional +import threading + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +class FileCombiner: + """Combines multiple files into a single output file, with various options.""" + + def __init__(self): + self.source_dir = "." + self.output_file = "combined_files.txt" + self.extensions: List[str] = [] # List of file extensions to include + self.exclude_folders: List[str] = ['.git'] # List of folders to exclude + self.exclude_patterns: List[str] = [] # List of regex patterns to exclude from file paths + self.include_line_numbers: bool = False # Whether to include line numbers in the output + self.include_timestamp: bool = False # Whether to include file modification timestamp + self.include_file_size: bool = False # Whether to include file size + self.add_syntax_highlight: bool = False # Whether to add basic syntax highlighting (requires manual language spec) + self.max_file_size_mb: Optional[float] = None # Maximum file size (MB) to include + self.create_zip_archive: bool = False # Whether to create a zip archive of the output + self.exclude_images: bool = False # Whether to exclude common image files + self.exclude_executable: bool = False # Whether to exclude executable files + self.exclude_temp_and_backup_files: bool = False # Whether to exclude temp and backup files + self.exclude_hidden_files: bool = False # Whether to exclude hidden files + self.num_worker_threads: int = 4 # Number of threads to use for processing + self.lock = threading.Lock() # Lock for thread-safe file writing + + self._image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff'] + self._temp_backup_extensions = ['.tmp', '.temp', '.bak', '~'] + + def get_user_preferences(self): + """Interactively gets user preferences for file combination.""" + print("\n=== File Combiner Configuration ===") + + self.source_dir = self._get_input("Source directory (default: .): ", self.source_dir, os.path.isdir) + self.output_file = self._get_input("Output file name (default: combined_files.txt): ", self.output_file) + self.extensions = self._get_list_input("File extensions to include (comma-separated, or Enter for all): ") + self.exclude_folders = self._get_list_input("Folders to exclude (comma-separated, default: .git): ", self.exclude_folders) + self.exclude_patterns = self._get_list_input("Regex patterns to exclude (comma-separated): ") + + self.include_line_numbers = self._get_boolean_input("Include line numbers? (y/n): ") + self.include_timestamp = self._get_boolean_input("Include timestamps? (y/n): ") + self.include_file_size = self._get_boolean_input("Include file sizes? (y/n): ") + self.add_syntax_highlight = self._get_boolean_input("Add syntax highlighting (requires manual language spec)? (y/n): ") + self.max_file_size_mb = self._get_float_input("Max file size to include (MB, or Enter for no limit): ") + self.create_zip_archive = self._get_boolean_input("Create zip archive of output? (y/n): ") + self.exclude_images = self._get_boolean_input("Exclude common image files (basic check, not fully reliable)? (y/n): ") + self.exclude_executable = self._get_boolean_input("Exclude executable files? (y/n): ") + self.exclude_temp_and_backup_files = self._get_boolean_input("Exclude temp/backup files? (y/n): ") + self.exclude_hidden_files = self._get_boolean_input("Exclude hidden files? (y/n): ") + self.num_worker_threads = self._get_int_input("Number of worker threads (default: 4): ", 4, lambda x: x > 0) + + def _get_input(self, prompt: str, default: Optional[str] = None, validator: Callable[[str], bool] = lambda x: True) -> str: + """Gets validated string input from the user.""" + while True: + value = input(prompt).strip() + if not value: + return default if default is not None else "" # Handle case where default is explicitly None + if validator(value): + return value + print("Invalid input.") + + def _get_list_input(self, prompt: str, default: Optional[List[str]] = None) -> List[str]: + """Gets a comma-separated list from the user.""" + value = input(prompt).strip() + if not value: + return default or [] + return [x.strip() for x in value.split(',') if x.strip()] + + def _get_boolean_input(self, prompt: str) -> bool: + """Gets a boolean response from the user.""" + return self._get_input(prompt, "n", lambda x: x.lower() in ('y', 'n')) == 'y' + + def _get_float_input(self, prompt: str) -> Optional[float]: + """Gets a float response from the user or None for no input""" + while True: + value = input(prompt).strip() + if not value: + return None + try: + return float(value) + except ValueError: + print("Invalid input. Please enter a number.") + + def _get_int_input(self, prompt: str, default: int, validator: Callable[[int], bool] = lambda x: True) -> int: + """Gets a validated integer input from the user.""" + while True: + value = input(prompt).strip() + if not value: + return default + try: + int_value = int(value) + if validator(int_value): + return int_value + else: + print("Invalid input. Value does not meet criteria.") + except ValueError: + print("Invalid input. Please enter an integer.") + + def should_process_file(self, filepath: str) -> bool: + """Determines if a file should be processed based on user settings.""" + try: + if self.extensions and not any(filepath.endswith(ext) for ext in self.extensions): + return False + if any(folder in filepath for folder in self.exclude_folders): + return False + if self.exclude_patterns and any(re.search(pattern, filepath) for pattern in self.exclude_patterns): + return False + file_size = os.path.getsize(filepath) + if self.max_file_size_mb is not None and file_size > self.max_file_size_mb * 1024 * 1024: + return False + if self.exclude_images and any(filepath.lower().endswith(ext) for ext in self._image_extensions): + return False + if self.exclude_executable and os.access(filepath, os.X_OK): + return False + if self.exclude_temp_and_backup_files and (filepath.startswith(tempfile.gettempdir()) or any(filepath.endswith(ext) for ext in self._temp_backup_extensions)): + return False + if self.exclude_hidden_files and os.path.basename(filepath).startswith('.'): + return False + return True + except OSError as e: + logging.warning(f"Could not determine if file should be processed {filepath}: {e}") + return False # If any error occurs during checking, default to not processing + + def _process_file(self, filepath: str, outfile): + """Processes and writes a single file to the output.""" + try: + with open(filepath, 'r', encoding='utf-8', errors='ignore') as infile: # Handle potential encoding errors + content = infile.read() + with self.lock: # Lock for multithreading + self._write_file_header(outfile, filepath) + if self.add_syntax_highlight: + ext = os.path.splitext(filepath)[1] + outfile.write(f"```{ext[1:] if ext else ''}\n") + if self.include_line_numbers: + for i, line in enumerate(content.splitlines(), 1): + outfile.write(f"{i:4d} | {line}\n") + else: + outfile.write(content) + if self.add_syntax_highlight: + outfile.write("```\n") + outfile.write("\n") + return 1, os.path.getsize(filepath) + except Exception as e: + logging.error(f"Error reading {filepath}: {e}") + with self.lock: + outfile.write(f"Error reading {filepath}: {e}\n\n") + return 0, 0 + + def _write_summary(self, outfile): + """Writes the initial summary information to the output file.""" + outfile.write("=== File Combination Summary ===\n") + outfile.write(f"Generated on: {datetime.datetime.now()}\n") + outfile.write(f"Source directory: {os.path.abspath(self.source_dir)}\n") + outfile.write(f"Included extensions: {', '.join(self.extensions) if self.extensions else 'All'}\n") + outfile.write(f"Excluded folders: {', '.join(self.exclude_folders)}\n") + outfile.write("=" * 80 + "\n\n") + + def _write_file_header(self, outfile, filepath): + """Writes file header information to output.""" + outfile.write("=" * 80 + "\n") + outfile.write(f"File: {os.path.relpath(filepath, self.source_dir)}\n") + if self.include_timestamp: + timestamp = datetime.datetime.fromtimestamp(os.path.getmtime(filepath)) + outfile.write(f"Last Modified: {timestamp}\n") + if self.include_file_size: + try: + size = os.path.getsize(filepath) + outfile.write(f"Size: {size / 1024:.2f} KB\n") + except OSError as e: + logging.warning(f"Could not get size of {filepath} for header: {e}") + outfile.write("=" * 80 + "\n\n") + + def _write_combination_summary(self, outfile, files_processed, total_size): + """Writes the final summary of the combination to the output file.""" + outfile.write("=" * 80 + "\n") + outfile.write(f"Total files processed: {files_processed}\n") + outfile.write(f"Total size: {total_size / 1024 / 1024:.2f} MB\n") + + def combine_files(self): + """Combines the files based on user preferences.""" + try: + with open(self.output_file, 'w', encoding='utf-8') as outfile: + self._write_summary(outfile) + file_paths = [] + for dirpath, dirnames, filenames in os.walk(self.source_dir, followlinks=False): + dirnames[:] = [d for d in dirnames if d not in self.exclude_folders] + for filename in filenames: + filepath = os.path.join(dirpath, filename) + if self.should_process_file(filepath): + file_paths.append(filepath) + + with ThreadPoolExecutor(max_workers=self.num_worker_threads) as executor: + # Submit tasks and process results as they become available to avoid holding all in memory + futures = [executor.submit(self._process_file, filepath, outfile) for filepath in file_paths] + files_processed = 0 + total_size = 0 + for future in futures: + try: + processed, size = future.result() + files_processed += processed + total_size += size + except Exception as e: + logging.error(f"Error processing a file: {e}") + + self._write_combination_summary(outfile, files_processed, total_size) + + if self.create_zip_archive: + self._create_zip_archive() + + logging.info(f"Combined {files_processed} files into {self.output_file}") + logging.info(f"Total size: {total_size / 1024 / 1024:.2f} MB") + print(f"\nCombined {files_processed} files into {self.output_file}") + print(f"Total size: {total_size / 1024 / 1024:.2f} MB") + + except Exception as e: + logging.error(f"An error occurred during file combination: {e}") + print(f"An error occurred: {e}") + + def _create_zip_archive(self): + """Creates a zip archive of the output file.""" + zip_filename = self.output_file.replace('.txt', '.zip') + try: + with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf: + zipf.write(self.output_file, arcname=os.path.basename(self.output_file)) + logging.info(f"Created zip archive: {zip_filename}") + print(f"Created zip archive: {zip_filename}") + + except Exception as e: + logging.error(f"Error creating zip archive: {e}") + print(f"Error creating zip archive: {e}") + +def main(): + combiner = FileCombiner() + combiner.get_user_preferences() + combiner.combine_files() + +if __name__ == "__main__": + main()