Source code for budou.mecabsegmenter
# -*- coding: utf-8 -*-
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MeCab based Segmenter.
Word segmenter module powered by `MeCab <https://github.com/taku910/mecab>`_.
You need to install MeCab to use this segmenter.
The easiest way to install MeCab is to run :code:`make install-mecab`. The
script will download source codes from GitHub and build the tool. It also setup
`IPAdic <https://ja.osdn.net/projects/ipadic/>`_, a standard dictionary for
Japanese.
"""
import logging
import sys
import six
from .segmenter import Segmenter
from .chunk import Chunk, ChunkList
_DEPENDENT_POS_FORWARD = set()
_DEPENDENT_POS_BACKWARD = {u'助詞', u'助動詞'}
_DEPENDENT_LABEL_FORWARD = set()
_DEPENDENT_LABEL_BACKWARD = {u'非自立'}
[docs]class MecabSegmenter(Segmenter):
"""MeCab Segmenter.
Attributes:
tagger (MeCab.Tagger): MeCab Tagger to parse the input sentence.
supported_languages (list of str): List of supported languages' codes.
"""
supported_languages = {'ja'}
def __init__(self):
try:
import MeCab
self.tagger = MeCab.Tagger('-Ochasen')
except ImportError:
logging.error(
('mecab-python3 is not installed. Install the module by running '
'`$ pip install mecab-python3`. If MeCab is not installed in your '
'system yet, run `$ make install-mecab` instead.'))
sys.exit(1)
[docs] def segment(self, source, language=None):
"""Returns a chunk list from the given sentence.
Args:
source (str): Source string to segment.
language (str, optional): A language code.
Returns:
A chunk list. (:obj:`budou.chunk.ChunkList`)
Raises:
ValueError: If :code:`language` is given and it is not included in
:code:`supported_languages`.
"""
if language and not language in self.supported_languages:
raise ValueError(
'Language {} is not supported by MeCab segmenter'.format(language))
chunks = ChunkList()
seek = 0
source_str = source.encode('utf-8') if six.PY2 else source
results = self.tagger.parse(source_str).split('\n')[:-2]
for row in results:
if six.PY2:
row = row.decode('utf-8')
token = row.split('\t')
word = token[0]
labels = token[3].split('-')
pos = labels[0]
label = labels[1] if len(labels) > 1 else None
if source[seek: seek + len(word)] != word:
assert source[seek] == ' '
assert source[seek + 1: seek + len(word) + 1] == word
chunks.append(Chunk.space())
seek += 1
dependency = None
if pos in _DEPENDENT_POS_FORWARD:
dependency = True
elif pos in _DEPENDENT_POS_BACKWARD:
dependency = False
elif label in _DEPENDENT_LABEL_FORWARD:
dependency = True
elif label in _DEPENDENT_LABEL_BACKWARD:
dependency = False
chunk = Chunk(word, pos=pos, label=label, dependency=dependency)
if chunk.is_punct():
chunk.dependency = chunk.is_open_punct()
chunks.append(chunk)
seek += len(word)
chunks.resolve_dependencies()
return chunks