Source code for budou.chunk

# -*- coding: utf-8 -*-
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Chunk module as a unit of word segment with helpers.
"""

import collections
import collections.abc
import unicodedata
from xml.etree import ElementTree as ET

import html5lib

class Chunk:
  """A unit for word segmentation.

  Attributes:
    word (str): Surface word of the chunk.
    pos (str, optional): Part of speech.
    label (str, optional): Label information.
    dependency (bool, optional): Dependency to neighbor words.
        :code:`None` for no dependency, :code:`True` for dependency to the
        following word, and :code:`False` for the dependency to the previous
        word.

  Args:
    word (str): Surface word of the chunk.
    pos (str, optional): Part of speech.
    label (str, optional): Label information.
    dependency (bool, optional): Dependency to neighbor words.
        :code:`None` for no dependency, :code:`True` for dependency to the
        following word, and :code:`False` for the dependency to the previous
        word.
  """
  # Part-of-speech markers used for the synthetic chunks created by
  # :meth:`space` and :meth:`breakline`.
  _SPACE_POS = 'SPACE'
  _BREAK_POS = 'BREAK'

  # Inclusive (start, end) code point ranges treated as CJK by
  # :meth:`has_cjk`. Kept at class level so the table is built only once.
  # Ranges taken from
  # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/util.py#L149
  _CJK_CODEPOINT_RANGES = (
      (4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215),
      (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607))

  def __init__(self, word, pos=None, label=None, dependency=None):
    self.word = word
    self.pos = pos
    self.label = label
    self.dependency = dependency

  def __repr__(self):
    return 'Chunk(%s, %s, %s, %s)' % (
        repr(self.word), self.pos, self.label, self.dependency)

  @classmethod
  def space(cls):
    """Creates space Chunk.

    Returns:
      A chunk (:obj:`budou.chunk.Chunk`) whose word is a single space and
      whose part of speech is the space marker.
    """
    return cls(u' ', cls._SPACE_POS)

  @classmethod
  def breakline(cls):
    """Creates breakline Chunk.

    Returns:
      A chunk (:obj:`budou.chunk.Chunk`) whose word is a newline and whose
      part of speech is the break marker.
    """
    return cls(u'\n', cls._BREAK_POS)

  def serialize(self):
    """Returns serialized chunk data in dictionary.

    Returns:
      dict: The chunk's ``word``, ``pos``, ``label`` and ``dependency``
      attributes plus ``has_cjk``, the result of :meth:`has_cjk`.
    """
    return {
        'word': self.word,
        'pos': self.pos,
        'label': self.label,
        'dependency': self.dependency,
        'has_cjk': self.has_cjk(),
    }

  def is_space(self):
    """Whether the chunk is a space.

    The check is based on the part of speech, not on the word itself.

    Returns:
      bool: True if it is a space.
    """
    return self.pos == self._SPACE_POS

  def is_punct(self):
    """Whether the chunk is a punctuation mark.

    Only single-character words are considered; the character must belong to
    one of the Unicode "P" (punctuation) general categories.

    See also https://en.wikipedia.org/wiki/Unicode_character_property

    Returns:
      bool: True if it is a punctuation mark.
    """
    # unicodedata.category() accepts exactly one character, so the length
    # check must come first.
    return (len(self.word) == 1 and
            unicodedata.category(self.word).startswith('P'))

  def is_open_punct(self):
    """Whether the chunk is an open punctuation mark.

    Ps: Punctuation, open (e.g. opening bracket characters)
    Pi: Punctuation, initial quote (e.g. opening quotation mark)
    See also https://en.wikipedia.org/wiki/Unicode_character_property

    Returns:
      bool: True if it is an open punctuation mark.
    """
    return self.is_punct() and unicodedata.category(self.word) in {'Ps', 'Pi'}

  def has_cjk(self):
    """Checks if the word of the chunk contains CJK characters.

    This is using unicode codepoint ranges from
    https://github.com/nltk/nltk/blob/develop/nltk/tokenize/util.py#L149

    Returns:
      bool: True if the chunk has any CJK character. An empty word yields
      False.
    """
    # Generator expression so that the scan stops at the first match.
    return any(
        start <= ord(char) <= end
        for char in self.word
        for start, end in self._CJK_CODEPOINT_RANGES)


class ChunkList(collections.abc.MutableSequence):
  """List of :obj:`budou.chunk.Chunk` with some helpers.

  This list accepts only instances of :obj:`budou.chunk.Chunk`.

  Example:
    .. code-block:: python

       from budou.chunk import Chunk, ChunkList
       chunks = ChunkList(Chunk('abc'), Chunk('def'))
       chunks.append(Chunk('ghi'))  # OK
       chunks.append('jkl')         # NG

  Args:
    args (list of :obj:`budou.chunk.Chunk`): Initial values included in the
        list.
  """

  def __init__(self, *args):
    # Backing storage; always kept as a plain list of Chunk instances.
    self.list = []
    # extend() goes through insert(), so every initial value is type-checked.
    self.extend(args)

  def _check(self, val):
    """Checks if the value is an instance of :obj:`budou.chunk.Chunk`.

    Args:
      val (:obj:`budou.chunk.Chunk`): input to check

    Raises:
      TypeError: If :code:`val` is not an instance of :obj:`budou.chunk.Chunk`.
    """
    if not isinstance(val, Chunk):
      raise TypeError(
          'ChunkList accepts only Chunk instances, got %s.'
          % type(val).__name__)

  def __len__(self):
    return len(self.list)

  def __getitem__(self, i):
    return self.list[i]

  def __delitem__(self, i):
    del self.list[i]

  def __setitem__(self, i, v):
    self._check(v)
    self.list[i] = v

  def insert(self, index, value):
    """Inserts a chunk before the given index.

    Args:
      index (int): Position to insert at.
      value (:obj:`budou.chunk.Chunk`): Chunk to insert.

    Raises:
      TypeError: If :code:`value` is not an instance of
          :obj:`budou.chunk.Chunk`.
    """
    self._check(value)
    self.list.insert(index, value)

  def get_overlaps(self, offset, length):
    """Returns chunks overlapped with the given range.

    Offsets are character positions in the concatenation of all chunk words.

    Args:
      offset (int): Begin offset of the range.
      length (int): Length of the range.

    Returns:
      Overlapped chunks. (:obj:`budou.chunk.ChunkList`)
    """
    text = ''.join(chunk.word for chunk in self)
    # In case entity's offset points to a space just before the entity.
    # The bounds check keeps an offset at or past the end of the text (for
    # example on an empty list) from raising IndexError.
    if offset < len(text) and text[offset] == ' ':
      offset += 1
    position = 0
    result = ChunkList()
    for chunk in self:
      chunk_end = position + len(chunk.word)
      # Half-open interval intersection of [position, chunk_end) and
      # [offset, offset + length).
      if offset < chunk_end and position < offset + length:
        result.append(chunk)
      position = chunk_end
    return result

  def swap(self, old_chunks, new_chunk):
    """Swaps old consecutive chunks with new chunk.

    Everything between the first and the last of :code:`old_chunks`
    (inclusive) is removed and :code:`new_chunk` is inserted in its place.

    Args:
      old_chunks (:obj:`budou.chunk.ChunkList`): List of consecutive Chunks to
          be removed. Must not be empty and every chunk must be in this list.
      new_chunk (:obj:`budou.chunk.Chunk`): A Chunk to be inserted.
    """
    indexes = [self.index(chunk) for chunk in old_chunks]
    del self[indexes[0]:indexes[-1] + 1]
    self.insert(indexes[0], new_chunk)

  def resolve_dependencies(self):
    """Resolves chunk dependency by concatenating them.

    Runs a forward pass, then a backward pass, and finally replaces trailing
    spaces of CJK chunks with breakline chunks.
    """
    self._concatenate_inner(True)
    self._concatenate_inner(False)
    self._insert_breaklines()

  def _concatenate_inner(self, direction):
    """Concatenates chunks based on each chunk's dependency.

    Chunks are collected until one is found that does not depend on the next
    chunk in the traversal direction; the collected chunks are then merged
    into a single chunk that takes its pos, label and dependency from that
    final chunk.

    Args:
      direction (bool): Direction of concatenation process. True for forward.
    """
    source_chunks = list(self)
    if not direction:
      # The backward pass walks the list from the end.
      source_chunks.reverse()
    pending = []
    merged = []
    for chunk in source_chunks:
      pending.append(chunk)
      if (
          # if the chunk has matched dependency, do concatenation.
          chunk.dependency == direction or
          # if the chunk is SPACE, concatenate to the previous chunk.
          (direction is False and chunk.is_space())
          ):
        continue
      # Restore reading order before joining words in the backward pass.
      ordered = pending if direction else pending[::-1]
      new_word = ''.join(tmp_chunk.word for tmp_chunk in ordered)
      merged.append(Chunk(new_word, pos=chunk.pos, label=chunk.label,
                          dependency=chunk.dependency))
      pending = []
    # Chunks still waiting for a partner at the end are kept unmerged.
    merged.extend(pending)
    if not direction:
      merged.reverse()
    self.list = merged

  def _insert_breaklines(self):
    """Inserts a breakline instead of a trailing space if the chunk is in CJK.

    The trailing space is removed from the chunk's word in place and a
    breakline chunk is added right after it.
    """
    target_chunks = []
    for chunk in self:
      # endswith() is safe for an empty word, unlike indexing with [-1].
      if chunk.word.endswith(' ') and chunk.has_cjk():
        chunk.word = chunk.word[:-1]
        target_chunks.append(chunk)
        target_chunks.append(chunk.breakline())
      else:
        target_chunks.append(chunk)
    self.list = target_chunks

  def html_serialize(self, attributes, max_length=None):
    """Returns concatenated HTML code with SPAN tag.

    Chunks containing CJK characters are each wrapped in their own SPAN
    carrying :code:`attributes`, unless they are longer than
    :code:`max_length`. All other chunks are emitted as plain text. The
    result is re-parsed as an HTML fragment and serialized by html5lib with
    its sanitizer enabled.

    Args:
      attributes (dict): A map of name-value pairs for attributes of output
          SPAN tags.
      max_length (int, optional): Maximum length of span enclosed chunk.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    for chunk in self:
      if (chunk.has_cjk() and
          not (max_length and len(chunk.word) > max_length)):
        ele = ET.Element('span')
        ele.text = chunk.word
        for key, val in attributes.items():
          ele.attrib[key] = val
        doc.append(ele)
      elif len(doc):
        # add word without span tag for non-CJK text (e.g. English)
        # by appending it after the last element.
        # Element.getchildren() no longer exists in Python 3.9+, so the
        # element is indexed directly.
        last = doc[-1]
        last.tail = (last.tail or '') + chunk.word
      else:
        # No child element yet: the text belongs to the root element.
        doc.text = (doc.text or '') + chunk.word
    result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    result = html5lib.serialize(
        html5lib.parseFragment(result), sanitize=True,
        quote_attr_values='always')
    return result
Source code for budou.chunk

budou

Navigation

Related Topics