# -*- coding: utf-8 -*-
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TinySegmenter based Segmenter.
Word segmenter module powered by TinySegmenter, a compact Japanese tokenizer
originally developed by Taku Kudo. This is built on its Python port
(https://pypi.org/project/tinysegmenter3/) developed by Tatsuro Yasukawa.
"""
import logging
import sys
import six
import tinysegmenter
from .segmenter import Segmenter
from .chunk import Chunk, ChunkList
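
# For reference, tinysegmenter.tokenize splits raw Japanese text into a
# list of word strings. Illustrative output only; actual boundaries depend
# on the TinySegmenter model:
#
#     >>> tinysegmenter.tokenize(u'私の名前は中野です')
#     ['私', 'の', '名前', 'は', '中野', 'です']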
_PARTICLES = {u'か', u'かしら', u'から', u'が', u'くらい', u'けれども', u'こそ',
              u'さ', u'さえ', u'しか', u'だけ', u'だに', u'だの', u'て', u'で', u'でも',
              u'と', u'ところが', u'とも', u'な', u'など', u'なり', u'に', u'ね', u'の',
              u'ので', u'のに', u'は', u'ば', u'ばかり', u'へ', u'ほど', u'まで', u'も',
              u'や', u'やら', u'よ', u'より', u'わ', u'を'}
"""set of str: Common particles in Japanese.

Refer to https://en.wikipedia.org/wiki/Japanese_particles
"""
_AUX_VERBS = {u'です', u'でしょ', u'でし', u'ます', u'ませ', u'まし'}
"""set of str: Popular auxiliary verbs in Japanese."""


def is_hiragana(word):
  """Checks if the word is a single Japanese hiragana character.

  This uses the Unicode codepoint range for hiragana:
  https://en.wikipedia.org/wiki/Hiragana_(Unicode_block)

  Args:
    word (str): A word.

  Returns:
    bool: True if the word is a hiragana.
  """
  # 12353 (U+3041) through 12447 (U+309F) spans the Hiragana Unicode block.
  return len(word) == 1 and 12353 <= ord(word) <= 12447
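
# A quick sanity check of the codepoint range above (illustrative only):
#
#     >>> is_hiragana(u'あ')   # U+3042, inside the hiragana block
#     True
#     >>> is_hiragana(u'ア')   # U+30A2, katakana
#     False
#     >>> is_hiragana(u'ああ')  # longer than a single character
#     False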


class TinysegmenterSegmenter(Segmenter):
  """TinySegmenter based Segmenter.

  Attributes:
    supported_languages (set of str): Supported language codes.
  """

  supported_languages = {'ja'}

  def segment(self, source, language=None):
    """Returns a chunk list from the given sentence.

    Args:
      source (str): Source string to segment.
      language (str, optional): A language code.

    Returns:
      A chunk list. (:obj:`budou.chunk.ChunkList`)

    Raises:
      ValueError: If :code:`language` is given and it is not included in
          :code:`supported_languages`.
    """
    if language and language not in self.supported_languages:
      raise ValueError(
          'Language {} is not supported by TinySegmenter segmenter.'.format(
              language))

    chunks = ChunkList()
    results = tinysegmenter.tokenize(source)
    seek = 0
    for word in results:
      word = word.strip()
      if not word:
        continue
      if source[seek: seek + len(word)] != word:
        # tinysegmenter strips whitespace from tokens, so a mismatch with
        # the source means the next character must be a single space.
        assert source[seek] == ' '
        assert source[seek + 1: seek + len(word) + 1] == word
        chunks.append(Chunk.space())
        seek += 1
      # Particles, auxiliary verbs, and single hiragana characters attach
      # to the preceding word, so mark them as depending on the previous
      # chunk (dependency=False) to avoid breaking lines before them.
      dependency = None
      if word in _PARTICLES or word in _AUX_VERBS or is_hiragana(word):
        dependency = False
      chunk = Chunk(word, dependency=dependency)
      if chunk.is_punct():
        # Opening punctuation depends on the following chunk; closing
        # punctuation depends on the previous chunk.
        chunk.dependency = chunk.is_open_punct()
      chunks.append(chunk)
      seek += len(word)
    chunks.resolve_dependencies()
    return chunks
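

# A minimal usage sketch (not part of the module). The import path below is
# hypothetical and depends on where this file sits in the budou package; it
# also assumes Chunk exposes a `word` attribute, as in budou.chunk:
#
#     from budou.tinysegmentersegmenter import TinysegmenterSegmenter
#     segmenter = TinysegmenterSegmenter()
#     chunks = segmenter.segment(u'今日は良い天気です。', language='ja')
#     print([chunk.word for chunk in chunks])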