File: ]+>([^<]+)', html_content) file_name = file_match.group(1) if file_match else None # Message content msg_match = re.search(fr'

(.*?)

', html_content, re.DOTALL) content = msg_match.group(1) if msg_match else "" content = content.replace('
', '\n') cleaned_content = clean_html(content) # Apply word truncation if limit is specified if word_limit: cleaned_content = truncate_words(cleaned_content, word_limit) post_text = f"--- Post {post_id} ---\n" if file_name: post_text += f"[File: {file_name}]\n" post_text += cleaned_content + "\n" output_buffer.append(post_text) full_output = "\n".join(output_buffer) # Print to console print(full_output) # Write to file if output_root provided if output_root: timestamp = datetime.now().strftime("%Y-%m-%d_%H") output_dir = os.path.join(output_root, f"{board}_{timestamp}") os.makedirs(output_dir, exist_ok=True) file_path = os.path.join(output_dir, f"{thread_id}.txt") try: with open(file_path, 'w', encoding='utf-8') as f: f.write(full_output) print(f"--- Saved to {file_path} ---", file=sys.stderr) except Exception as e: print(f"Error writing to file: {e}", file=sys.stderr) return full_output def show_usage(): print("Usage:") print(" python3 chan_extractor.py catalog ") print(" python3 chan_extractor.py thread [output_root_dir] [word_limit]") print("\nExamples:") print(" python3 chan_extractor.py catalog a") print(" python3 chan_extractor.py thread a 285635254 downloads 10") if __name__ == "__main__": if len(sys.argv) < 3: show_usage() sys.exit(1) cmd = sys.argv[1].lower() if cmd == "catalog": board = sys.argv[2] threads = get_catalog(board) for tid, r_count, teaser in threads: print(f"{tid}|{r_count}|{teaser}") elif cmd == "thread": if len(sys.argv) < 4: print("Error: thread_id required.") show_usage() sys.exit(1) board = sys.argv[2] thread_id = sys.argv[3] out_root = None word_limit = None if len(sys.argv) > 4: # Check if arg4 is a digit (word_limit) or a directory if sys.argv[4].isdigit(): word_limit = int(sys.argv[4]) else: out_root = sys.argv[4] if len(sys.argv) > 5 and sys.argv[5].isdigit(): word_limit = int(sys.argv[5]) get_thread(board, thread_id, out_root, word_limit) else: print(f"Unknown command: {cmd}") show_usage() sys.exit(1)