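# 基于pyhanlp,识别中文文本中的命名实体(机构、人名、地名)。
# (Recognize named entities -- organizations, person names, place names -- in Chinese text using pyhanlp.)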
from pyhanlp import *
from jpype import JArray,JString
def convert_to_jarray(array):
    """Copy a collection of strings into a new one-dimensional Java ``String[]``.

    Args:
        array: A sized, iterable collection of strings; both ``len()`` and
            iteration must be supported.

    Returns:
        A ``JArray(JString, 1)`` instance with the same length as ``array``,
        holding its elements in the same order.
    """
    # Allocate the Java array up front, then fill it element by element.
    jarray = JArray(JString, 1)(len(array))
    for index, item in enumerate(array):
        jarray[index] = item
    return jarray
# Demo pipeline: run HanLP's CRF models over one sample sentence and print the
# output of each stage (segmentation, POS tagging, NER, full lexical analysis).
text = "北京大学经济学院教授李四来到香港特别行政区"

# Stage 1: word segmentation.
CRFSegmenter = JClass("com.hankcs.hanlp.model.crf.CRFSegmenter")
segmenter = CRFSegmenter()
text_seg = segmenter.segment(text)
# str() each item so the join also works if JPype hands back Java string
# objects instead of Python str (i.e. string conversion is disabled).
print(' '.join(str(word) for word in text_seg))

# Stage 2: part-of-speech tagging. Reuse the segmentation computed above
# instead of segmenting the same text a second time.
CRFTagger = JClass("com.hankcs.hanlp.model.crf.CRFPOSTagger")
tagger = CRFTagger()
text_pos = tagger.tag(text_seg)
print(' '.join(str(tag) for tag in text_pos))

# Stage 3: named-entity recognition. The words and tags are passed as Java
# String[] arrays, hence the explicit conversion.
CRFNERecognizer = JClass("com.hankcs.hanlp.model.crf.CRFNERecognizer")
recognizer = CRFNERecognizer()
text_ne = recognizer.recognize(convert_to_jarray(text_seg),
                               convert_to_jarray(text_pos))
print(' '.join(str(label) for label in text_ne))

# Stage 4: the all-in-one lexical analyzer. The class loaded here is the CRF
# analyzer, so it is named accordingly.
CRFLexicalAnalyzer = JClass("com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer")
# Backward-compatible alias for the previous, misleading name.
PerceptronLexicalAnalyzer = CRFLexicalAnalyzer
analyzer = CRFLexicalAnalyzer()
text_analyze = analyzer.analyze(text)
print(text_analyze)