import os import re import sys from docx import Document from docx.text.paragraph import Paragraph from docx.oxml import OxmlElement # ============================================================================= # Phase 1: Parse summary MD # ============================================================================= def parse_summary_md(content): """Parse news summary MD into structured data dict.""" data = {} lines = content.splitlines() blocks = [] current = [] blank_count = 0 for raw_line in content.splitlines(keepends=True): stripped = raw_line.strip() is_bracket_start = stripped.startswith('\uff08') or stripped.startswith('(') if stripped: if blank_count >= 1 or (is_bracket_start and current): if current: blocks.append(current) current = [] blank_count = 0 current.append(stripped) else: blank_count += 1 if current: blocks.append(current) for block in blocks: for idx, line in enumerate(block): if line.startswith('\u62df\u6295\u680f\u76ee\uff1a'): data['Category'] = line[len('\u62df\u6295\u680f\u76ee\uff1a'):].strip() elif line.startswith('\u4e8b\u4ef6\u65f6\u95f4\uff1a'): data['EventDate'] = line[len('\u4e8b\u4ef6\u65f6\u95f4\uff1a'):].strip() elif line.startswith('\u4ef7\u503c\u70b9\uff1a'): parts = [] for sub in block[idx:]: parts.append(sub[len('\u4ef7\u503c\u70b9\uff1a'):].strip() if sub.startswith('\u4ef7\u503c\u70b9\uff1a') else sub.strip()) data['ValuePoint'] = ''.join(parts).strip() break non_meta_blocks = [ b for b in blocks if b and not b[0].startswith('\u62df\u6295\u680f\u76ee') and not b[0].startswith('\u4e8b\u4ef6\u65f6\u95f4') and not b[0].startswith('\u4ef7\u503c') ] footer_blocks = [] while non_meta_blocks: last_block = non_meta_blocks[-1] last_line = ' '.join(last_block).strip() has_phone = bool(re.search(r'\d{11}', last_line)) keyword_hit = ( last_line.startswith('\uff08') or last_line.startswith('(') or last_line.endswith('\uff09') or last_line.endswith(')') or '\u6765\u6e90\uff1a' in last_line or '\u6765\u6e90:' in last_line or '\u7814\u7a76\u9662' in last_line or '\u79d1\u5b66\u9662' in last_line or '\u5355\u4f4d' in last_line or '\u4f5c\u8005' in last_line ) is_footer = has_phone or (len(last_line) <= 50 and keyword_hit) if is_footer: footer_blocks.insert(0, last_block) non_meta_blocks.pop() else: break main_blocks = non_meta_blocks if main_blocks: title_block = main_blocks[0] title_line = ' '.join(title_block) data['MainTitle'] = title_line data['SubTitle'] = '' body_blocks = main_blocks[1:] if body_blocks: data['Body'] = '\n'.join(' '.join(b) for b in body_blocks) else: data['Body'] = '' else: data['MainTitle'] = 'No title found' data['SubTitle'] = '' data['Body'] = '' data['OrgUnit'] = '' data['Author'] = '' data['Phone'] = '' data['SourceMedia'] = '' data['SourceDate'] = '' data['ReviewUnit'] = '' data['ReviewDate'] = '2026\u5e746\u67083\u65e5' data['has_footer'] = False if footer_blocks: data['has_footer'] = True footer_text = ' '.join([' '.join(b) for b in footer_blocks]).strip() footer_text = footer_text.replace('(', '\uff08').replace(')', '\uff09').replace(':', '\uff1a').replace(',', '\uff0c') segments = [s.strip() for s in re.split(r'\uff08|\uff09', footer_text) if s.strip()] for seg in segments: if '\u6765\u6e90\uff1a' in seg or '\u6765\u6e90' in seg: inner = seg.replace('\u6765\u6e90\uff1a', '').replace('\u6765\u6e90', '').strip() parts = inner.split('\uff0c') data['SourceMedia'] = parts[0].strip() if len(parts) > 0 else '' data['SourceDate'] = parts[1].strip() if len(parts) > 1 else '' elif re.search(r'\d{11}', seg): parts = seg.split('\uff0c') data['OrgUnit'] = parts[0].strip() if len(parts) > 0 else '' data['Author'] = parts[1].strip() if len(parts) > 1 else '' data['Phone'] = parts[2].strip() if len(parts) > 2 else '' elif '\u7814\u7a76\u9662' in seg or '\u79d1\u5b66\u9662' in seg or '\u4e2d\u5fc3' in seg or '\u6240' in seg: parts = seg.split('\uff0c') if data['OrgUnit']: data['ReviewUnit'] = parts[0].strip() if len(parts) > 0 else '' if len(parts) > 1 and '\u5ba1\u5b9a' in last_line: data['ReviewDate'] = parts[1].strip() else: data['OrgUnit'] = parts[0].strip() if len(parts) > 0 else '' if len(parts) > 1: data['Author'] = parts[1].strip() if len(parts) > 2: data['Phone'] = parts[2].strip() defaults = { 'Category': '', 'EventDate': '', 'ValuePoint': '', 'MainTitle': 'No title found', 'SubTitle': '', 'Body': '', } for k, v in defaults.items(): if k not in data: data[k] = v data['AuthorLine'] = _build_author_line(data) return data def _build_author_line(data): """Assemble the footer author line for summary docs.""" parts = [] if data.get('OrgUnit'): parts.append(data['OrgUnit']) if data.get('Author'): parts.append(data['Author']) if data.get('Phone'): parts.append(data['Phone']) if data.get('SourceMedia') or data.get('SourceDate'): src_parts = [] if data.get('SourceMedia'): src_parts.append('\u6765\u6e90\uff1a' + data['SourceMedia']) if data.get('SourceDate'): src_parts.append(data['SourceDate']) parts.append('\uff08' + '\uff0c'.join(src_parts) + '\uff09') if data.get('ReviewUnit'): parts.append('\uff08' + data['ReviewUnit']) if data.get('ReviewDate'): parts[-1] += '\uff0c' + data['ReviewDate'] parts[-1] += '\uff09' return '\u3001'.join(parts) # ============================================================================= # Phase 2: Parse original MD # ============================================================================= def parse_original_md(content): """Parse news original-article MD into structured data dict.""" data = { 'Title': '', 'SourceDate': '', 'SourceMedia': '', 'Author': '', 'Body': '', 'SourceLink': '', 'SerialHeader': '', } lines = [line.strip() for line in content.split('\n')] if lines and (lines[0].startswith("\u539f\u6587\u7a3f") or lines[0].startswith("\u539f\u6587")): colon_pos = lines[0].find('\uff1a') if colon_pos > 0: data['SerialHeader'] = lines[0][:colon_pos + 1] lines[0] = lines[0][colon_pos + 1:] else: data['SerialHeader'] = lines[0].rstrip('\uff1a') lines = lines[1:] lines = [l for l in lines if l] if not lines: return data data['Title'] = lines[0] body_start_idx = 1 body_end_idx = len(lines) if (len(lines) > 1 and ("\u3010\u6587/" in lines[1] or re.search(r'\d{4}-\d{2}-\d{2}', lines[1]))): line2 = lines[1] date_match = re.search(r'(\d{4}-\d{2}-\d{2}(?:\s+\d{2}:\d{2})?)', line2) if date_match: data['SourceDate'] = date_match.group(1) ROLE_WORDS = ('\u8bb0\u8005', '\u7f16\u8f91', '\u603b\u53f0', '\u7279\u7ea6', '\u64b0\u7a3f', '\u901a\u8baf\u5458', '\u8bc4\u8bba\u5458', '\u4e3b\u7ba1', '\u7f16\u5ba1') def strip_role(text): for w in ROLE_WORDS: text = re.sub(rf'\s*{re.escape(w)}\s*', '', text) return text.strip() raw = line2.replace('\u3010\u6587/', '').rstrip('\u3011').strip() slash_pos = raw.find('/') comma_pos = raw.find('\uff0c') if comma_pos >= 0 and (slash_pos < 0 or comma_pos < slash_pos): sep_pos = comma_pos elif slash_pos >= 0: sep_pos = slash_pos else: sep_pos = -1 if sep_pos >= 0: author_part = raw[:sep_pos].strip() media_part = raw[sep_pos + 1:].strip() if '/' in media_part: media_part = media_part.split('/')[-1].strip() else: MEDIA_RE = re.compile(r'(\u7f51|\u62a5|\u793e|\u53f0|\u6742\u5fd7|\u5468\u520a|\u6708\u520a)$') tokens = re.split(r'[\s\uff0c,\u3001]+', raw) for i in range(len(tokens) - 1, -1, -1): if MEDIA_RE.search(tokens[i]): media_part = ''.join(tokens[i:]).strip() author_part = ''.join(tokens[:i]).strip() break else: author_part = raw media_part = '' data['Author'] = strip_role(author_part) data['SourceMedia'] = strip_role(media_part) body_start_idx = 2 if "\u539f\u6587\u94fe\u63a5\uff1a" in lines[-1]: data['SourceLink'] = lines[-1].replace("\u539f\u6587\u94fe\u63a5\uff1a", "").strip() body_end_idx -= 1 if body_end_idx > body_start_idx: last_line = lines[body_end_idx - 1] if (last_line.startswith("\uff08") or last_line.startswith("(")) and ( last_line.endswith("\uff09") or last_line.endswith(")")): footer_text = last_line[1:-1].strip() if "\u8bb0\u8005" in footer_text: data['Author'] = footer_text if "\u603b\u53f0" in footer_text: data['SourceMedia'] = "\u4e2d\u592e\u5e7f\u64ad\u7535\u89c6\u603b\u53f0" else: data['Author'] = footer_text body_end_idx -= 1 para_lines = content.split('\n') body_para_segments = [] current_para = [] for pl in para_lines: stripped = pl.strip() if stripped: current_para.append(stripped) else: if current_para: body_para_segments.append(' '.join(current_para)) current_para = [] if current_para: body_para_segments.append(' '.join(current_para)) body_raw_lines = lines[body_start_idx:body_end_idx] body_raw_set = set(body_raw_lines) data['Body'] = '\n'.join( seg for seg in body_para_segments if any(ln in seg for ln in body_raw_set) ) data['SourceDate'] = _normalize_date(data['SourceDate']) if not data['SourceMedia']: data['SourceMedia'] = "\u673a\u5668\u4e4b\u5fc3" if not data['Author']: data['Author'] = "\u7f16\u8f91" return data def _normalize_date(date_str): """Normalize date: 2026-06-01 -> 2026\u5e746\u67083\u65e5""" if not date_str: return date_str m = re.match(r'(\d{4})-(\d{2})-(\d{2})', date_str) if m: y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3)) return f"{y}\u5e74{mo}\u6708{d}\u65e5" return date_str # ============================================================================= # Phase 3: Replace placeholders in DOCX template # ============================================================================= def replace_docx_template(template_path, output_path, data): """Replace {{FieldName}} placeholders in template and save as output.""" from lxml import etree W = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' WN = f'{{{W}}}' doc = Document(template_path) for paragraph in doc.paragraphs: p_xml = paragraph._p runs = p_xml.findall(f'{WN}r') all_text = ''.join(r.text or '' for r in runs) while True: matches = list(re.finditer(r'\{\{(\w+)\}\}', all_text)) if not matches: break m = matches[-1] key = m.group(1) if key not in data: break value = str(data[key]) start, end = m.start(), m.end() acc = 0 sri, eri = -1, -1 rel_start = rel_end = -1 for ri, run in enumerate(runs): rl = len(run.text or '') if start < acc + max(rl, 1) and sri == -1: sri = ri rel_start = start - acc if end - 1 < acc + max(rl, 1) and eri == -1: eri = ri rel_end = end - acc - 1 acc += rl if sri == -1 or eri == -1: break body_key = 'Body' if key == body_key and '\n' in value: parts = [p.strip() for p in value.split('\n') if p.strip()] s_text = runs[sri].text or '' if sri == eri: runs[sri].text = s_text[:rel_start] + parts[0] + s_text[rel_end + 1:] else: e_text = runs[eri].text or '' runs[sri].text = s_text[:rel_start] + parts[0] runs[eri].text = e_text[rel_end + 1:] for ri in range(sri + 1, eri): runs[ri].text = '' current_p = p_xml for part in parts[1:]: new_p = OxmlElement('w:p') pPr = p_xml.find(f'{WN}pPr') if pPr is not None: new_p.append(etree.fromstring(etree.tostring(pPr))) new_r = etree.SubElement(new_p, f'{WN}r') rPr = runs[sri].find(f'{WN}rPr') if rPr is not None: new_r.append(etree.fromstring(etree.tostring(rPr))) new_t = etree.SubElement(new_r, f'{WN}t') new_t.text = part new_t.set(f'{WN}space', 'preserve') current_p.addnext(new_p) current_p = new_p else: if sri == eri: run_text = runs[sri].text or '' runs[sri].text = run_text[:rel_start] + value + run_text[rel_end + 1:] else: s_text = runs[sri].text or '' e_text = runs[eri].text or '' runs[sri].text = s_text[:rel_start] + value runs[eri].text = e_text[rel_end + 1:] for ri in range(sri + 1, eri): runs[ri].text = '' runs = p_xml.findall(f'{WN}r') all_text = ''.join(r.text or '' for r in runs) doc.save(output_path) print(f"[OK] Saved: {output_path}") # ============================================================================= # Entry point # ============================================================================= if __name__ == "__main__": script_dir = os.path.dirname(os.path.abspath(__file__)) if len(sys.argv) > 1: md_files = [os.path.abspath(f) for f in sys.argv[1:]] else: md_files = [] for f in os.listdir(script_dir): if f.endswith('.md') and ('\u79d1\u6280\u65b0\u95fb\u539f\u6587' in f or '\u79d1\u6280\u65b0\u95fb\u6458\u8981' in f): md_files.append(os.path.join(script_dir, f)) md_files.sort() if not md_files: print("No tech-news MD files found!") sys.exit(1) for md_file in md_files: basename = os.path.basename(md_file) if "\u6458\u8981" in basename: template_file = os.path.join(script_dir, "template-summary.docx") parser = parse_summary_md else: template_file = os.path.join(script_dir, "template-original.docx") parser = parse_original_md output_file = md_file.replace(".md", ".docx") if not os.path.exists(template_file): print(f"[Skip] Template not found: {template_file}") continue if not os.path.exists(md_file): print(f"[Skip] MD file not found: {md_file}") continue with open(md_file, 'r', encoding='utf-8') as f: md_content = f.read() payload = parser(md_content) replace_docx_template(template_file, output_file, payload)