网站建设方案报价_四川省建设厅网上查询_网络seo外包_搜索引擎排名营销

import re
from docx import Document
from typing import List, Tuple
from docx.enum.text import WD_ALIGN_PARAGRAPH
from transformers import AutoTokenizer
import torch, json# import torch._inductor.lowering
# torch._inductor.lowering.should_fallback_max_pool2d_with_indicesfrom vllm import LLM, SamplingParams
import time, tiktokenclass WordContentExtractor:"""Word文档内容提取器，精确提取指定章节内容功能：1. 根据完整标题匹配提取指定章节2. 完全保留标题原始格式（包括空格）3. 返回两个合并后的列表：- final_result_paragraph: 按段落合并后的内容列表- final_result_sentence: 按句子合并后的内容列表使用示例：>>> extractor = WordContentExtractor("example.docx")>>> paragraphs, sentences = extractor.extract_contents()"""def __init__(self, file_path: str):"""初始化提取器参数：file_path -- Word文档路径"""self.file_path = file_pathtry:self.doc = Document(file_path)except Exception as e:raise ValueError(f"无法加载Word文档: {e}")# 需要提取的完整章节标题（保留原始空格）self.target_titles = ['摘    要', 'Abstract', '1  绪论', '2  相关理论技术与评估指标', '3  面向药物相互作用中复杂关系推理的逻辑查询模型', '4  面向人类基因合成致死预测的可解释多跳推理模型', '5  面向药物相互作用预测的模糊逻辑查询模型', '结    论', '致    谢']self.exclude = ['参 考 文 献','作者简历及攻读硕士学位期间的科研成果','大连海事大学学位论文授权使用声明']def is_target_title(self, text: str) -> bool:"""精确判断段落是否为目标章节标题参数：text -- 段落文本返回：如果是目标章节标题返回True，否则False"""return text in self.target_titlesdef extract_contents(self) -> Tuple[List[str], List[str]]:"""精确提取指定章节内容返回：元组，包含两个列表：- final_result_paragraph: 按段落合并后的内容列表- final_result_sentence: 按句子合并后的内容列表"""final_result_paragraph = []final_result_sentence = []current_section = Nonecurrent_section_paragraphs = []exclude = 0for i, paragraph in enumerate(self.doc.paragraphs):text = paragraph.text.strip()if text in self.exclude:exclude = 1continue# 检查是否是目标章节标题if self.is_target_title(text):exclude = 0# 处理之前章节的内容if current_section is not None and current_section_paragraphs:# 添加到段落结果final_result_paragraph.extend(current_section_paragraphs)# 处理为句子section_text = ' '.join(current_section_paragraphs)sentences = re.split(r'(?<=[。！？!?])', section_text)sentences = [s.strip() for s in sentences if s.strip()]final_result_sentence.extend(sentences)# 开始新章节current_section = textcurrent_section_paragraphs = []continue# 如果是当前章节的正文内容if current_section is not None and text and exclude == 0:# if paragraph.alignment != WD_ALIGN_PARAGRAPH.CENTER:# if (('表' in text or '图' in text or 'Figure' in text or 'Table' in text) and '  ' in text) == False:if '  ' not in text and text[-1] in ['。', '！', '？', '!', '?', "：", ":"]:text = re.sub(r'\[\d+\]', '', text)current_section_paragraphs.append(text)# 处理最后一个章节if current_section is not None and current_section_paragraphs:final_result_paragraph.extend(current_section_paragraphs)section_text = ' '.join(current_section_paragraphs)sentences = re.split(r'(?<=[。！？!?])', section_text)sentences = [s.strip() for s in sentences if s.strip()]final_result_sentence.extend(sentences)return final_result_paragraph, final_result_sentencedef print_contents(self, paragraphs: List[str], sentences: List[str]):"""打印提取的内容参数：paragraphs -- 按段落提取的内容列表sentences -- 按句子提取的内容列表"""print("="*50)print("按段落提取的结果 (共{}段):".format(len(paragraphs)))for i, para in enumerate(paragraphs, 1):print(f"{i}. {para}")# print("\n" + "="*50)# print("按句子提取的结果 (共{}句):".format(len(sentences)))# for i, sent in enumerate(sentences, 1):#     print(f"{i}. {sent}")def llm(prompts):MODEL_PATH = r"/home/cjk_ubuntu/LLaMA-Factory/Qwen/Qwen2.5-3B-Instruct原"MODEL_PATH = r"/mnt/d/LLaMA-Factory/Qwen/Qwen2.5-7B-Instruct-AWQ"# 获取tiktoken编码encoding = tiktoken.encoding_for_model("gpt-4")# import sys# sys.path.append(r"D:\Anaconda3\envs\qwen_new\Lib\site-packages\vllm")# print(sys.path)# Initialize the tokenizertokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)# Pass the default decoding hyperparameters of Qwen2.5-7B-Instruct# max_tokens is for the maximum length for generation.sampling_params = SamplingParams(temperature=0.6, max_tokens=2048)# Input the model name or path. Can be GPTQ or AWQ models.llm = LLM(model=MODEL_PATH, max_model_len=2048)all_text = ""old_time = time.time()formatted_prompts = []# prompts = prompts[:30]for prompt in prompts:if "其中" in prompt:continuemessages = [{"role": "system", "content": "检查是否存在语病，若没有则输出‘No’，若有则输出。输出格式为'Yes\n原句：{原文句子}\n错误原因：{错误原因}。' 只针对句子的“搭配不当、成分残缺、成分赘余、语序不当、用词不当”进行分析，不分析句子是否“结构复杂、理解困难”。如果没有直接且明显的语病，一律输出No。"},{"role": "user", "content": prompt}]formatted_prompt = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)formatted_prompts.append(formatted_prompt)# generate outputsoutputs = llm.generate(formatted_prompts, sampling_params)# Print the outputs.records = []for output in outputs:prompt = output.promptgenerated_text = output.outputs[0].textall_text += generated_textif 'Yes'.lower() in generated_text.lower() and 'No'.lower() not in generated_text.lower():# print(f"{generated_text}")if '原句：' in generated_text and '错误原因：' in generated_text:records.append({"原句": generated_text.split('原句：')[1].split('错误原因：')[0].strip(), "错误原因": generated_text.split('错误原因：')[1].strip()})with open(r"/mnt/c/Users/CJK/Desktop/record.json", "w", encoding="utf-8") as f:json.dump(records, f, ensure_ascii=False, indent=4)print(f"\nspeed={(len(encoding.encode(all_text)) / (time.time() - old_time)):.2f} token/s.\n")if __name__ == "__main__":doc_path = r"/mnt/c/users/CJK/Desktop/毕业论文/程俊凯-1120220303-基于知识图谱的药物知识发现算法研究.docx"extractor = WordContentExtractor(doc_path)paragraphs, sentences = extractor.extract_contents()# 打印结果摘要print("\n提取完成:")print(f"- 共提取 {len(paragraphs)} 个段落")print(f"- 共提取 {len(sentences)} 个句子")llm(sentences)# # 打印详细内容# extractor.print_contents(paragraphs, sentences)
网站建设方案报价_四川省建设厅网上查询_网络seo外包_搜索引擎排名营销

最新新闻

热搜词