import re
from docx import Document
from typing import List, Tuple
from docx.enum.text import WD_ALIGN_PARAGRAPH
from transformers import AutoTokenizer
import torch, json
from vllm import LLM, SamplingParams
import time, tiktoken


class WordContentExtractor:
    """Extracts the content of specified chapters from a Word document.

    Features:
    1. Chapters are matched by their complete titles.
    2. The original title formatting (including internal spaces) is preserved exactly.
    3. Returns two merged lists:
       - final_result_paragraph: content merged by paragraph
       - final_result_sentence: content merged by sentence

    Usage:
        >>> extractor = WordContentExtractor("example.docx")
        >>> paragraphs, sentences = extractor.extract_contents()
    """

    def __init__(self, file_path: str):
        """Initialize the extractor.

        Args:
            file_path: path to the Word document
        """
        self.file_path = file_path
        try:
            self.doc = Document(file_path)
        except Exception as e:
            raise ValueError(f"Failed to load Word document: {e}")
        # Chapter titles to extract; spacing must match the document exactly.
        self.target_titles = [
            '摘 要', 'Abstract', '1 绪论', '2 相关理论技术与评估指标',
            '3 面向药物相互作用中复杂关系推理的逻辑查询模型',
            '4 面向人类基因合成致死预测的可解释多跳推理模型',
            '5 面向药物相互作用预测的模糊逻辑查询模型',
            '结 论', '致 谢'
        ]
        # Sections to skip entirely (references, author CV, authorization statement).
        self.exclude = [
            '参 考 文 献',
            '作者简历及攻读硕士学位期间的科研成果',
            '大连海事大学学位论文授权使用声明'
        ]

    def is_target_title(self, text: str) -> bool:
        """Return True if the paragraph text is exactly one of the target chapter titles."""
        return text in self.target_titles

    def extract_contents(self) -> Tuple[List[str], List[str]]:
        """Extract the content of the specified chapters.

        Returns:
            A tuple of two lists:
            - final_result_paragraph: content merged by paragraph
            - final_result_sentence: content merged by sentence
        """
        final_result_paragraph = []
        final_result_sentence = []
        current_section = None
        current_section_paragraphs = []
        exclude = 0
        for paragraph in self.doc.paragraphs:
            text = paragraph.text.strip()
            if text in self.exclude:
                exclude = 1
                continue
            if self.is_target_title(text):
                exclude = 0
                # Flush the previous section before starting a new one.
                if current_section is not None and current_section_paragraphs:
                    final_result_paragraph.extend(current_section_paragraphs)
                    section_text = ' '.join(current_section_paragraphs)
                    # Split after Chinese/Western sentence-ending punctuation,
                    # keeping the delimiter attached to the sentence.
                    sentences = re.split(r'(?<=[。!?!?])', section_text)
                    sentences = [s.strip() for s in sentences if s.strip()]
                    final_result_sentence.extend(sentences)
                current_section = text
                current_section_paragraphs = []
                continue
            if current_section is not None and text and exclude == 0:
                # Keep only space-free paragraphs ending in sentence punctuation;
                # this filters out headings, captions, and table fragments.
                if ' ' not in text and text[-1] in ['。', '!', '?', '!', '?', ':', ':']:
                    # Strip numeric citation markers such as [12].
                    text = re.sub(r'\[\d+\]', '', text)
                    current_section_paragraphs.append(text)
        # Flush the last section.
        if current_section is not None and current_section_paragraphs:
            final_result_paragraph.extend(current_section_paragraphs)
            section_text = ' '.join(current_section_paragraphs)
            sentences = re.split(r'(?<=[。!?!?])', section_text)
            sentences = [s.strip() for s in sentences if s.strip()]
            final_result_sentence.extend(sentences)
        return final_result_paragraph, final_result_sentence

    def print_contents(self, paragraphs: List[str], sentences: List[str]):
        """Print the extracted content.

        Args:
            paragraphs: content extracted by paragraph
            sentences: content extracted by sentence
        """
        print("=" * 50)
        print("Paragraph-level results ({} paragraphs):".format(len(paragraphs)))
        for i, para in enumerate(paragraphs, 1):
            print(f"{i}. {para}")
        print("=" * 50)
        print("Sentence-level results ({} sentences):".format(len(sentences)))
        for i, sent in enumerate(sentences, 1):
            print(f"{i}. {sent}")
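
# A minimal illustration of the lookbehind split used in extract_contents above
# (the input is made up for demonstration; the trailing empty string is removed
# by the subsequent strip filter):
#     re.split(r'(?<=[。!?!?])', '第一句。第二句!第三句?')
#     -> ['第一句。', '第二句!', '第三句?', '']
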
{para}")def llm(prompts):MODEL_PATH = r"/home/cjk_ubuntu/LLaMA-Factory/Qwen/Qwen2.5-3B-Instruct原"MODEL_PATH = r"/mnt/d/LLaMA-Factory/Qwen/Qwen2.5-7B-Instruct-AWQ"encoding = tiktoken.encoding_for_model("gpt-4")tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)sampling_params = SamplingParams(temperature=0.6, max_tokens=2048)llm = LLM(model=MODEL_PATH, max_model_len=2048)all_text = ""old_time = time.time()formatted_prompts = []for prompt in prompts:if "其中" in prompt:continuemessages = [{"role": "system", "content": "检查是否存在语病,若没有则输出‘No’,若有则输出。输出格式为'Yes\n原句:{原文句子}\n错误原因:{错误原因}。' 只针对句子的“搭配不当、成分残缺、成分赘余、语序不当、用词不当”进行分析,不分析句子是否“结构复杂、理解困难”。如果没有直接且明显的语病,一律输出No。"},{"role": "user", "content": prompt}]formatted_prompt = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)formatted_prompts.append(formatted_prompt)outputs = llm.generate(formatted_prompts, sampling_params)records = []for output in outputs:prompt = output.promptgenerated_text = output.outputs[0].textall_text += generated_textif 'Yes'.lower() in generated_text.lower() and 'No'.lower() not in generated_text.lower():if '原句:' in generated_text and '错误原因:' in generated_text:records.append({"原句": generated_text.split('原句:')[1].split('错误原因:')[0].strip(), "错误原因": generated_text.split('错误原因:')[1].strip()})with open(r"/mnt/c/Users/CJK/Desktop/record.json", "w", encoding="utf-8") as f:json.dump(records, f, ensure_ascii=False, indent=4)print(f"\nspeed={(len(encoding.encode(all_text)) / (time.time() - old_time)):.2f} token/s.\n")if __name__ == "__main__":doc_path = r"/mnt/c/users/CJK/Desktop/毕业论文/程俊凯-1120220303-基于知识图谱的药物知识发现算法研究.docx"extractor = WordContentExtractor(doc_path)paragraphs, sentences = extractor.extract_contents()print("\n提取完成:")print(f"- 共提取 {len(paragraphs)} 个段落")print(f"- 共提取 {len(sentences)} 个句子")llm(sentences)