Source code for budou.mecabsegmenter

# -*- coding: utf-8 -*-
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""MeCab based Segmenter.

Word segmenter module powered by `MeCab <https://github.com/taku910/mecab>`_.
You need to install MeCab to use this segmenter.
The easiest way to install MeCab is to run :code:`make install-mecab`. The
script downloads the source code from GitHub and builds the tool. It also sets
up `IPAdic <https://ja.osdn.net/projects/ipadic/>`_, a standard dictionary for
Japanese.
"""

import logging
import sys
import six
from .segmenter import Segmenter
from .chunk import Chunk, ChunkList

# Lookup tables consulted by MecabSegmenter.segment() to choose the
# ``dependency`` value passed to each Chunk. A token whose part of speech (or
# secondary label) is in a *_FORWARD set gets dependency=True; one in a
# *_BACKWARD set gets dependency=False; anything else gets None.
# NOTE(review): presumably True/False mean "attach to the following/preceding
# chunk" -- confirm against ChunkList.resolve_dependencies().

# Parts of speech marked dependency=True (currently none).
_DEPENDENT_POS_FORWARD = set()
# Parts of speech marked dependency=False: particles and auxiliary verbs.
_DEPENDENT_POS_BACKWARD = {u'助詞', u'助動詞'}
# Secondary labels marked dependency=True (currently none).
_DEPENDENT_LABEL_FORWARD = set()
# Secondary labels marked dependency=False: "non-independent" words.
_DEPENDENT_LABEL_BACKWARD = {u'非自立'}

class MecabSegmenter(Segmenter):
  """MeCab Segmenter.

  Attributes:
    tagger (MeCab.Tagger): MeCab Tagger to parse the input sentence.
    supported_languages (set of str): Set of supported languages' codes.
  """

  supported_languages = {'ja'}

  def __init__(self):
    """Creates the MeCab tagger used by :meth:`segment`.

    The ``MeCab`` module is imported here rather than at module level, so
    importing this module does not itself require MeCab. If the import fails,
    an installation hint is logged and the process exits with status 1.
    """
    try:
      import MeCab
      self.tagger = MeCab.Tagger('-Ochasen')
    except ImportError:
      logging.error(
          ('mecab-python3 is not installed. Install the module by running '
           '`$ pip install mecab-python3`. If MeCab is not installed in your '
           'system yet, run `$ make install-mecab` instead.'))
      sys.exit(1)

  def segment(self, source, language=None):
    """Returns a chunk list from the given sentence.

    Each row of the tagger output becomes one chunk. Spaces in the source
    that the tagger did not report as tokens are re-inserted as space chunks,
    one chunk per skipped space.

    Args:
      source (str): Source string to segment.
      language (str, optional): A language code.

    Returns:
      A chunk list. (:obj:`budou.chunk.ChunkList`)

    Raises:
      ValueError: If :code:`language` is given and it is not included in
                  :code:`supported_languages`, or if a token reported by the
                  tagger cannot be located in :code:`source` at the expected
                  position.
    """
    if language and language not in self.supported_languages:
      raise ValueError(
          'Language {} is not supported by MeCab segmenter'.format(language))

    chunks = ChunkList()
    # Offset into `source` of the next character not yet covered by a chunk.
    seek = 0
    source_str = source.encode('utf-8') if six.PY2 else source
    # The last two entries of the split output are dropped; presumably they
    # are the end-of-sentence marker row and the empty string after the final
    # newline -- confirm against MeCab's output format.
    results = self.tagger.parse(source_str).split('\n')[:-2]
    for row in results:
      if six.PY2:
        row = row.decode('utf-8')
      # Rows are tab-separated: column 0 is the surface form and column 3
      # holds the hyphen-joined part-of-speech labels.
      token = row.split('\t')
      word = token[0]
      labels = token[3].split('-')
      pos = labels[0]
      label = labels[1] if len(labels) > 1 else None

      # When the token does not start at the current offset, the tagger is
      # expected to have skipped separating spaces. Emit one space chunk per
      # skipped space so the chunk list still reproduces the source. Slicing
      # (rather than indexing) keeps this safe at the end of the string.
      while (source[seek: seek + len(word)] != word and
             source[seek: seek + 1] == ' '):
        chunks.append(Chunk.space())
        seek += 1
      # Raise explicitly instead of using `assert`, which is removed under
      # `python -O` and would let misaligned chunks through silently.
      if source[seek: seek + len(word)] != word:
        raise ValueError(
            'Failed to align MeCab token {!r} with the source string at '
            'offset {}'.format(word, seek))

      dependency = None
      if pos in _DEPENDENT_POS_FORWARD:
        dependency = True
      elif pos in _DEPENDENT_POS_BACKWARD:
        dependency = False
      elif label in _DEPENDENT_LABEL_FORWARD:
        dependency = True
      elif label in _DEPENDENT_LABEL_BACKWARD:
        dependency = False

      chunk = Chunk(word, pos=pos, label=label, dependency=dependency)
      # For punctuation the dependency derived above is overridden by whether
      # the chunk is an opening punctuation mark.
      if chunk.is_punct():
        chunk.dependency = chunk.is_open_punct()
      chunks.append(chunk)
      seek += len(word)
    chunks.resolve_dependencies()
    return chunks
Source code for budou.mecabsegmenter

budou

Navigation

Related Topics