How to Do Sentence Splitting and Word Tokenization for Chinese and English

A Brief Introduction to Chinese and English Tokenization

Word tokenization is a necessary step when doing machine learning on text, much like denoising is for image processing. English tokenization mainly separates punctuation from the words; Chinese tokenization splits the text into its individual words, e.g. '我是一只程序猿' → '我', '是', '一只', '程序猿'. Tokenization is an indispensable part of data preprocessing for text.

English Tokenization

For English tokenization we use the WordPunctTokenizer provided by nltk; the nltk library is quite handy. For the full code, see the Chinese tokenization section below.
One caveat: WordPunctTokenizer splits on punctuation, so given input such as 22.13% it produces 22 . 13 %, while the result we actually want is 22.13 %.
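A minimal sketch of the behaviour described above (assuming nltk is installed):

from nltk.tokenize import WordPunctTokenizer

text = 'the number of the money is 22.13 %'
# WordPunctTokenizer splits on every punctuation boundary,
# so the decimal number falls apart
print(WordPunctTokenizer().tokenize(text))
# ['the', 'number', 'of', 'the', 'money', 'is', '22', '.', '13', '%']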
nltk also provides another tokenization function, word_tokenize, which keeps 22.13 in one piece:
import nltk

sentences = 'hello world , what is your name? the number of the money is 22.13 % '
words = nltk.word_tokenize(sentences)
print(words)
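Note that word_tokenize relies on nltk's pre-trained punkt models, so you may need to run nltk.download('punkt') once; with the models in place, 22.13 is kept as a single token while the other punctuation is still split off.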

Chinese Tokenization

For Chinese tokenization we have LTP, the natural language processing platform from Harbin Institute of Technology (HIT), to thank: its pyltp module handles both word tokenization and sentence splitting. Without further ado, see its documentation for usage and download instructions.
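As a minimal sketch (assuming the LTP model files have been downloaded to the path used in the full script below), segmenting the example sentence from the introduction looks roughly like this:

import os
from pyltp import Segmentor

LTP_DATA_DIR = 'ltp-models/ltp_data'                      # adjust to where your LTP models live
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word-segmentation model

segmentor = Segmentor()
segmentor.load(cws_model_path)
words = segmentor.segment('我是一只程序猿')
print(list(words))   # e.g. something like ['我', '是', '一只', '程序猿']
segmentor.release()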
Here is the full code for Chinese and English tokenization:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
from nltk.tokenize import WordPunctTokenizer as WPT
from pyltp import Segmentor

LTP_DATA_DIR = 'ltp-models/ltp_data'                      # path to the LTP models
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # path to the word-segmentation model


def fenci():
    # English: tokenize each line with WordPunctTokenizer
    with open('分句/all_en.en', 'r', encoding='utf-8') as f:
        sentence_en = f.readlines()
    count = 0
    for i in range(len(sentence_en)):
        words_en = WPT().tokenize(sentence_en[i])
        fen_en_sentence = ' '.join(words_en)
        save_data(fen_en_sentence, count)

    # Chinese: segment each line with the LTP segmentor
    with open('分句/all.zh', 'r', encoding='utf-8') as f:
        sentence_zh = f.readlines()
    segmentor = Segmentor()
    segmentor.load(cws_model_path)
    count = 1
    for j in range(len(sentence_zh)):
        words_zh = segmentor.segment(sentence_zh[j].strip())
        fen_zh_sentence = ' '.join(words_zh)
        save_data(fen_zh_sentence, count)
    segmentor.release()


def save_data(data, count):
    # count == 0 -> English output file, count == 1 -> Chinese output file
    if count == 0:
        with open('分句/fen_all.en', 'a', encoding='utf-8') as f:
            f.write(data + '\n')
    elif count == 1:
        with open('分句/fen_all.zh', 'a', encoding='utf-8') as f:
            f.write(data + '\n')


if __name__ == '__main__':
    fenci()
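The script reads one sentence per line from 分句/all_en.en and 分句/all.zh, joins the tokens of each sentence with spaces, and appends the results to 分句/fen_all.en and 分句/fen_all.zh; the count argument simply tells save_data which of the two output files to append to.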

Chinese and English Sentence Splitting

I won't go into much detail; the code is below. If you have any questions, leave a comment (or follow me). English sentence splitting uses the nltk module, Chinese sentence splitting uses HIT's pyltp module.
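Both splitters can be tried on a short string first; a minimal sketch (again assuming the punkt model has been downloaded via nltk.download('punkt')):

import nltk.data
from pyltp import SentenceSplitter

# Chinese: SentenceSplitter works out of the box, no model file needed
print(list(SentenceSplitter.split('今天天气不错。我们去公园吧!')))

# English: nltk's pre-trained punkt sentence tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(tokenizer.tokenize('The weather is nice today. Shall we go to the park?'))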

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
from os import path

import nltk
import nltk.data
from pyltp import SentenceSplitter as SS


def get_data():
    # mainfile is the current directory path, subfile the sub-directories it contains,
    # filename the list of file names inside it
    for mainfile, subfile, filename in os.walk('abstract'):
        for i in range(len(filename)):
            if path.splitext(mainfile + filename[i])[1] == '.zh':
                # Chinese sentence splitting with pyltp's SentenceSplitter
                count = 1
                file_zh_name = mainfile + '/' + filename[i]
                print(file_zh_name)
                with open(file_zh_name, 'r', encoding='utf-8') as f_zh:
                    content_zh = f_zh.read()
                zh_sen = list(SS.split(content_zh))
                save_sentence_zh = '\n'.join(zh_sen)
                save_data(save_sentence_zh, count)

                # the matching English file has the same name with an .en extension
                count = 0
                file_en_name = list(file_zh_name)
                file_en_name[-1] = 'n'
                file_en_name[-2] = 'e'
                file_en_name = ''.join(file_en_name)
                print(file_en_name)
                # English sentence splitting with nltk's pre-trained punkt model
                with open(file_en_name, 'r', encoding='utf-8') as f_en:
                    content_en = f_en.read()
                tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
                sentences_en = tokenizer.tokenize(content_en)
                save_data(sentences_en, count)


def save_data(data, count):
    if count == 0:
        # data is a list of English sentences
        with open('分句/all.en', 'a', encoding='utf-8') as f:
            for line in data:
                f.write(line + '\n')
    elif count == 1:
        # data is one newline-joined string of Chinese sentences
        with open('分句/all.zh', 'a', encoding='utf-8') as f:
            f.write(data + '\n')


if __name__ == '__main__':
    get_data()
    print('都存完了')
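The script assumes a folder named abstract that holds parallel files differing only in extension (xxx.zh and xxx.en). Each .zh file is split into sentences with pyltp's SentenceSplitter and its .en counterpart with nltk's punkt model, and the sentences are appended, one per line, to 分句/all.zh and 分句/all.en, which the tokenization script above then consumes.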

If you have any questions, please don't hesitate to point them out; just leave a comment.