# Prompt text for the sentence-alignment request.
# It carries two format placeholders, {format_instructions} and {text},
# which are presumably filled in by a prompt template before the request is
# sent to the model -- confirm against the caller.
#
# The previous literal used "\ " sequences (an invalid escape that leaves a
# stray backslash in the prompt) and contained several spelling/grammar
# mistakes; implicit string concatenation keeps the source lines short
# without injecting backslashes or extra whitespace into the prompt.
template = (
    "I want you to act as a translator who is good at English and Chinese. "
    "I will give you a text in which there is one Chinese paragraph and one "
    "English paragraph. I need you to separate the English paragraph into "
    "sentences, and also separate the Chinese paragraph into sentences "
    "according to the meaning of each English sentence. Remember not to "
    "change any word of the paragraphs; only separate the paragraphs. "
    "{format_instructions} text: {text}"
)
# Two list-typed fields the model is asked to return: one holding the
# Chinese sentences and one holding the English sentences.
Chinese_sentences = ResponseSchema(
    name="Chinese_sentences",
    type='list',
    description='add each Chiense sentences into this list',
)
English_sentences = ResponseSchema(
    name="English_sentences",
    type='list',
    description='add each English sentence into this list',
)

# Build the parser from both fields, then ask it for the formatting
# instructions text.
response_schemas = [Chinese_sentences, English_sentences]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
# Single field in which the model is asked to return Chinese/English
# sentence pairs as key-value pairs.
aligned_sentences = ResponseSchema(
    name="aligned_sentences",
    description=(
        'add each Chiense and English sentence pair into here '
        'as a key-value pair'
    ),
)
response_schemas = [aligned_sentences]
'```json\n{\n "aligned_sentences": {\n "第二,我们必须复苏经济,推动实现更加强劲、绿色、健康的全球发展。": "Second, we must revitalize the economy and promote a more robust, green, and healthy global development.",\n "发展是实现人民幸福的关键。": "Development is the key to achieving the well-being of the people.",\n "面对疫情带来的严重冲击,我们要共同推动全球发展迈向平衡协调包容新阶段。": "Faced with the severe impact of the pandemic, we must jointly propel global development towards a new stage of balance, coordination, and inclusiveness.",\n "在此,我愿提出全球发展倡议:": "Here, I propose a Global Development Initiative:",\n "——坚持发展优先。": "- Adhere to development priority:",\n "将发展置于全球宏观政策框架的突出位置,加强主要经济体政策协调,保持连续性、稳定性、可持续性,构建更加平等均衡的全球发展伙伴关系,推动多边发展合作进程协同增效,加快落实联合国2030年可持续发展议程。": "Place development in a prominent position within the global macro-policy framework, strengthen policy coordination among major economies, maintain continuity, stability, and sustainability, build a more equal and balanced global development partnership, promote the coordinated and efficient progress of multilateral development cooperation processes, and accelerate the implementation of the United Nations 2030 Agenda for Sustainable Development.",\n "——坚持以人民为中心。": "- Adhere to people-centered development:",\n "在发展中保障和改善民生,保护和促进人权,做到发展为了人民、发展依靠人民、发展成果由人民共享,不断增强民众的幸福感、获得感、安全感,实现人的全面发展。": "In the course of development, ensure and improve people\'s well-being, protect and promote human rights, ensure that development is for the people, relies on the people, and the benefits of development are shared by the people, constantly enhance the happiness, sense of gain, and sense of security of the people, and achieve comprehensive human development.",\n "——坚持普惠包容。": "- Adhere to inclusive development:",\n "关注发展中国家特殊需求,通过缓债、发展援助等方式支持发展中国家尤其是困难特别大的脆弱国家,着力解决国家间和各国内部发展不平衡、不充分问题。": "Pay attention to the special needs of developing countries, support developing countries, especially those facing particular difficulties, with debt relief, development assistance, and other means, focus on addressing 
the issues of imbalances and inadequacies in development among and within countries.",\n "——坚持创新驱动。": "- Adhere to innovation-driven development:",\n "抓住新一轮科技革命和产业变革的历史性机遇,加速科技成果向现实生产力转化,打造开放、公平、公正、非歧视的科技发展环境,挖掘疫后经济增长新动能,携手实现跨越发展。": "Seize the historic opportunities of the new round of technological revolution and industrial transformation, accelerate the transformation of scientific achievements into real productivity, create an open, fair, just, and non-discriminatory environment for technological development, explore new impetus for post-pandemic economic growth, and work together to achieve leapfrog development.",\n "——坚持人与自然和谐共生。": "- Adhere to harmonious coexistence between humans and nature:",\n "完善全球环境治理,积极应对气候变化,构建人与自然生命共同体。": "Improve global environmental governance, actively address climate change, and build a community of life for all living things.",\n "加快绿色低碳转型,实现绿色复苏发展。": "Accelerate the green and low-carbon transformation, achieve green recovery and development.",\n ......}\n}\n```
import os from langchain.output_parsers import StructuredOutputParser, ResponseSchema from langchain.prompts import ChatPromptTemplate, PromptTemplate from langchain.chat_models import ChatOpenAI import openai import docx import re import pandas as pd from itertools import zip_longest import jieba import re import json
# Sort every paragraph of the document into the Chinese or the English list.
# A paragraph counts as Chinese when it contains at least one character in
# the U+4E00..U+9FFF range; everything else goes to the English list.
for paragraph in doc.paragraphs:
    text = paragraph.text.strip()
    has_chinese = re.search('[\u4e00-\u9fff]', text) is not None  # match Chinese characters
    target = chinese_paragraphs if has_chinese else english_paragraphs
    target.append(text)
# Merge the collected lines into one string per language.
# Chinese text is written without spaces between sentences, so its lines are
# concatenated directly.
chinese_paragraphs = ''.join(chinese_paragraphs)
# English lines need a separating space, otherwise the last word of one line
# is fused to the first word of the next ("development.Second"). Empty lines
# are skipped so they do not produce doubled spaces.
english_paragraphs = ' '.join(line for line in english_paragraphs if line)
def split_sentences(text):
    """Split *text* into sentences, keeping each sentence's end punctuation.

    A sentence ends at any of the full-width marks ``。!?:`` or the ASCII
    marks ``. ! ? :``. Each returned item is the sentence body followed by
    the mark that closed it.

    Text after the last mark is returned as a final sentence instead of
    being discarded, unless it is empty or whitespace only.

    Args:
        text: The string to split.

    Returns:
        A list of sentence strings in their original order; an empty list
        for empty input.
    """
    # The capturing group makes re.split keep the delimiters, so the result
    # alternates body, mark, body, mark, ..., trailing body (odd length).
    components = re.split(r'([。!?:.!?:])', text)

    # Re-attach every mark to the body that precedes it.
    sentences = [
        body + mark
        for body, mark in zip(components[0::2], components[1::2])
    ]

    # The last component is whatever followed the final mark; keep it when it
    # carries real content so an unterminated last sentence is not lost.
    remainder = components[-1]
    if remainder.strip():
        sentences.append(remainder)
    return sentences
# Build the text handed to the model: the printed form of the Chinese
# sentence list on the first line, the English paragraph on the next.
Chinese_sentences = split_sentences(chinese_paragraphs)
input_text = '\n'.join([str(Chinese_sentences), english_paragraphs])
# Read the OpenAI API key from the environment after loading a local .env
# file; a missing OPENAI_API_KEY raises KeyError.
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())  # read local .env file
openai.api_key = os.environ['OPENAI_API_KEY']
# Pick the model name by date to account for the deprecation of the pinned
# model: after 2024-06-12 use the unpinned name, otherwise the 0301 snapshot.
import datetime

current_date = datetime.date.today()
target_date = datetime.date(2024, 6, 12)
llm_model = (
    "gpt-3.5-turbo" if current_date > target_date else "gpt-3.5-turbo-0301"
)
# Structured-output setup: one field that is described to the model as
# holding Chinese/English sentence pairs as key-value pairs.
aligned_sentences = ResponseSchema(
    name="aligned_sentences",
    description=(
        'add each Chiense and English sentence pair into here '
        'as a key-value pair'
    ),
)
response_schemas = [aligned_sentences]

# Parser built from the schema list; used to parse gpt's response.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Formatting instructions text produced by the parser. The bare name on the
# last line only has an effect in an interactive/notebook session, where it
# displays the value.
format_instructions = output_parser.get_format_instructions()
format_instructions
import os from langchain.output_parsers import StructuredOutputParser, ResponseSchema from langchain.prompts import ChatPromptTemplate, PromptTemplate from langchain.chat_models import ChatOpenAI import openai from docx import Document import re import pandas as pd from itertools import zip_longest import streamlit as st from dotenv import load_dotenv, find_dotenv
# Page chrome: browser tab title/icon, main heading, sidebar heading and a
# short description of what the app does.
st.set_page_config(
    page_title="Bilingual Alignment",
    page_icon="📖",
)
st.markdown("# Bilingual Alignment")
st.sidebar.header("Bilingual Alignment")
st.write(
    """ You can upload your bilingual doc(zh_EN or en_US). This app will help you to align Chinese and English text and the last output an excel file """
)
def write_excel(Chinese_sentence_list, English_sentence_list, base_filename):
    """Write the two sentence lists side by side into an Excel workbook.

    The workbook is named ``<base_filename>_output.xlsx`` and has an English
    column ("英文") followed by a Chinese column ("中文"). When the lists
    differ in length the shorter one is padded with empty cells.

    Args:
        Chinese_sentence_list: Chinese sentences, one per row.
        English_sentence_list: English sentences, one per row.
        base_filename: Output file name without the ``_output.xlsx`` suffix.
    """
    st.write("正在写入excel文件")

    # Target path is derived from the caller-supplied base name.
    excel_filename = f'{base_filename}_output.xlsx'

    # Pair rows English-first to match the column order below; zip_longest
    # fills the shorter list with None.
    row_pairs = zip_longest(English_sentence_list, Chinese_sentence_list)
    pd.DataFrame(row_pairs, columns=["英文", "中文"]).to_excel(
        excel_filename, index=False
    )
def main():
    """Run the upload -> align -> export flow of the page.

    Shows a .docx uploader; once a file is provided, extracts its text,
    displays it, requests the aligned sentence lists and writes them to an
    Excel file named after the upload.
    """
    uploaded_file = st.file_uploader("上传中英文双语Word文档", type=["docx"])
    if uploaded_file is not None:
        # Name of the upload with its extension removed; used for the output file.
        base_filename, _extension = os.path.splitext(uploaded_file.name)

        # Show the extracted bilingual text on the page.
        input_text = extract_paragraphs(uploaded_file)
        st.write("中英文本")
        st.write(input_text)

        # Build the prompt pieces, query the model and export the result.
        template, format_instructions, output_parser = prompt_template()
        Chinese_sentence_list, English_sentence_list = call_gpt(
            template,
            input_text,
            format_instructions,
            output_parser,
        )
        write_excel(Chinese_sentence_list, English_sentence_list, base_filename)