Source code for budou.tinysegmentersegmenter

# -*- coding: utf-8 -*-
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""TinySegmenter based Segmenter.

Word segmenter module powered by TinySegmenter, a compact Japanese tokenizer
originally developed by Taku Kudo. This is built on its Python port
(https://pypi.org/project/tinysegmenter3/) developed by Tatsuro Yasukawa.
"""

import tinysegmenter
from .segmenter import Segmenter
from .chunk import Chunk, ChunkList

_PARTICLES = {u'か', u'かしら', u'から', u'が', u'くらい', u'けれども', u'こそ',
    u'さ', u'さえ', u'しか', u'だけ', u'だに', u'だの', u'て', u'で', u'でも',
    u'と', u'ところが', u'とも', u'な', u'など', u'なり', u'に', u'ね', u'の',
    u'ので', u'のに', u'は', u'ば', u'ばかり', u'へ', u'ほど', u'まで', u'も',
    u'や', u'やら', u'よ', u'より', u'わ', u'を'}
"""set of str: Common particles in Japanese.
Refer to https://en.wikipedia.org/wiki/Japanese_particles
"""

_AUX_VERBS = {u'です', u'でしょ', u'でし', u'ます', u'ませ', u'まし'}
""" set of str: Popylar auxiliary verbs in Japanese.
"""

def is_hiragana(word):
  """Checks if the word is a Japanese hiragana character.

  This uses the Unicode codepoint range for hiragana.
  https://en.wikipedia.org/wiki/Hiragana_(Unicode_block)

  Args:
    word (str): A word.

  Returns:
    bool: True if the word is a hiragana character.
  """
  return len(word) == 1 and 12353 <= ord(word) <= 12447
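
# For illustration, the check behaves as follows (hiragana occupy
# U+3041..U+309F, i.e. codepoints 12353..12447):
#
#   >>> is_hiragana(u'あ')   # U+3042, inside the hiragana block
#   True
#   >>> is_hiragana(u'ア')   # U+30A2, katakana, outside the block
#   False
#   >>> is_hiragana(u'あい')  # only single characters qualify
#   False
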
class TinysegmenterSegmenter(Segmenter):
  """TinySegmenter based Segmenter.

  Attributes:
    supported_languages (set of str): Codes of the supported languages.
  """

  supported_languages = {'ja'}

  def segment(self, source, language=None):
    """Returns a chunk list from the given sentence.

    Args:
      source (str): Source string to segment.
      language (str, optional): A language code.

    Returns:
      A chunk list. (:obj:`budou.chunk.ChunkList`)

    Raises:
      ValueError: If :code:`language` is given and it is not included in
          :code:`supported_languages`.
    """
    if language and language not in self.supported_languages:
      raise ValueError(
          'Language {} is not supported by TinySegmenter segmenter'.format(
              language))
    chunks = ChunkList()
    results = tinysegmenter.tokenize(source)
    seek = 0
    for word in results:
      word = word.strip()
      if not word:
        continue
      if source[seek: seek + len(word)] != word:
        # The tokenizer dropped a space; restore it as a space chunk and
        # advance the cursor past it.
        assert source[seek] == ' '
        assert source[seek + 1: seek + len(word) + 1] == word
        chunks.append(Chunk.space())
        seek += 1
      # Particles, auxiliary verbs, and single hiragana characters attach to
      # the preceding word, so mark them as dependent on the previous chunk.
      dependency = None
      if word in _PARTICLES or word in _AUX_VERBS or is_hiragana(word):
        dependency = False
      chunk = Chunk(word, dependency=dependency)
      if chunk.is_punct():
        # Open punctuation depends on the following chunk; closing
        # punctuation depends on the preceding one.
        chunk.dependency = chunk.is_open_punct()
      chunks.append(chunk)
      seek += len(word)
    chunks.resolve_dependencies()
    return chunks
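
# A minimal usage sketch, assuming this module is importable as part of the
# budou package and that ChunkList behaves as a list of Chunk objects (as in
# budou.chunk):
#
#   segmenter = TinysegmenterSegmenter()
#   chunks = segmenter.segment(u'今日は良い天気です。', language='ja')
#   print([chunk.word for chunk in chunks])
#
# Particles such as 'は' and auxiliary verbs such as 'です' are marked as
# dependent on the previous chunk above, so resolve_dependencies() merges
# them into the preceding chunks.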