# Source code for budou.chunk

# -*- coding: utf-8 -*-
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Chunk module as a unit of word segment with helpers.
"""

import collections
import unicodedata
from xml.etree import ElementTree as ET

try:
  from collections.abc import MutableSequence
except ImportError:  # Python 2 keeps the ABCs directly in `collections`.
  from collections import MutableSequence

import html5lib
from html5lib import getTreeWalker
from html5lib.constants import namespaces
from html5lib.filters import sanitizer


class Chunk:
  """A unit for word segmentation.

  Attributes:
    word (str): Surface word of the chunk.
    pos (str, optional): Part of speech.
    label (str, optional): Label information.
    dependency (bool, optional): Dependency to neighbor words.
        :code:`None` for no dependency, :code:`True` for dependency to the
        following word, and :code:`False` for the dependency to the previous
        word.

  Args:
    word (str): Surface word of the chunk.
    pos (str, optional): Part of speech.
    label (str, optional): Label information.
    dependency (bool, optional): Dependency to neighbor words.
        :code:`None` for no dependency, :code:`True` for dependency to the
        following word, and :code:`False` for the dependency to the previous
        word.
  """
  # Part-of-speech markers used for the synthetic chunks built by
  # :meth:`space` and :meth:`breakline`.
  _SPACE_POS = 'SPACE'
  _BREAK_POS = 'BREAK'

  # Inclusive code point ranges treated as CJK by :meth:`has_cjk`. Taken from
  # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/util.py#L149
  # Kept at class level so the table is built once instead of on every call.
  _CJK_CODEPOINT_RANGES = (
      (4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215),
      (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607))

  def __init__(self, word, pos=None, label=None, dependency=None):
    self.word = word
    self.pos = pos
    self.label = label
    self.dependency = dependency

  def __repr__(self):
    return 'Chunk(%s, %s, %s, %s)' % (
        repr(self.word), self.pos, self.label, self.dependency)

  @classmethod
  def space(cls):
    """Creates space Chunk.

    Returns:
      A chunk (:obj:`budou.chunk.Chunk`) whose word is a single space and
      whose part of speech is the SPACE marker.
    """
    return cls(u' ', cls._SPACE_POS)

  @classmethod
  def breakline(cls):
    """Creates breakline Chunk.

    Returns:
      A chunk (:obj:`budou.chunk.Chunk`) whose word is a newline and whose
      part of speech is the BREAK marker.
    """
    return cls(u'\n', cls._BREAK_POS)

  def serialize(self):
    """Returns serialized chunk data in dictionary.

    Returns:
      dict: The ``word``, ``pos``, ``label`` and ``dependency`` attributes
      plus ``has_cjk``, the result of :meth:`has_cjk`.
    """
    return {
        'word': self.word,
        'pos': self.pos,
        'label': self.label,
        'dependency': self.dependency,
        'has_cjk': self.has_cjk(),
    }

  def is_space(self):
    """Whether the chunk is a space.

    The decision is made on the part of speech, not on the word itself.

    Returns:
      bool: True if it is a space.
    """
    return self.pos == self._SPACE_POS

  def is_punct(self):
    """Whether the chunk is a punctuation mark.

    Only single-character words can qualify.

    See also https://en.wikipedia.org/wiki/Unicode_character_property

    Returns:
      bool: True if it is a punctuation mark.
    """
    # The length check must come first: unicodedata.category() only accepts a
    # single character.
    return len(self.word) == 1 and unicodedata.category(self.word)[0] == 'P'

  def is_open_punct(self):
    """Whether the chunk is an open punctuation mark.

    Ps: Punctuation, open (e.g. opening bracket characters)
    Pi: Punctuation, initial quote (e.g. opening quotation mark)
    See also https://en.wikipedia.org/wiki/Unicode_character_property

    Returns:
      bool: True if it is an open punctuation mark.
    """
    return self.is_punct() and unicodedata.category(self.word) in {'Ps', 'Pi'}

  def has_cjk(self):
    """Checks if the word of the chunk contains CJK characters.

    This is using unicode codepoint ranges from
    https://github.com/nltk/nltk/blob/develop/nltk/tokenize/util.py#L149

    Returns:
      bool: True if the chunk has any CJK character. An empty word has none.
    """
    ranges = self._CJK_CODEPOINT_RANGES
    for char in self.word:
      codepoint = ord(char)
      if any(start <= codepoint <= end for start, end in ranges):
        return True
    return False


class ChunkList(MutableSequence):
  """List of :obj:`budou.chunk.Chunk` with some helpers.

  This list accepts only instances of :obj:`budou.chunk.Chunk`.

  Example:
    .. code-block:: python

       from budou.chunk import Chunk, ChunkList
       chunks = ChunkList(Chunk('abc'), Chunk('def'))
       chunks.append(Chunk('ghi'))  # OK
       chunks.append('jkl')         # NG

  Args:
    args (list of :obj:`budou.chunk.Chunk`): Initial values included in the
        list.
  """

  def __init__(self, *args):
    # Backing storage; always a plain list of Chunk objects.
    self.list = []
    # extend() funnels every value through insert(), which type-checks it.
    self.extend(args)

  def _check(self, val):
    """Checks if the value is an instance of :obj:`budou.chunk.Chunk`.

    Args:
      val (:obj:`budou.chunk.Chunk`): input to check

    Raises:
      TypeError: If :code:`val` is not an instance of :obj:`budou.chunk.Chunk`.
    """
    if not isinstance(val, Chunk):
      raise TypeError(
          'ChunkList accepts only Chunk instances, got %s'
          % type(val).__name__)

  def __len__(self):
    return len(self.list)

  def __getitem__(self, i):
    # A slice index yields a plain list, not a ChunkList.
    return self.list[i]

  def __delitem__(self, i):
    del self.list[i]

  def __setitem__(self, i, v):
    self._check(v)
    self.list[i] = v

  def insert(self, index, value):
    """Inserts a chunk before the given index.

    Args:
      index (int): Position to insert at.
      value (:obj:`budou.chunk.Chunk`): Chunk to insert.

    Raises:
      TypeError: If :code:`value` is not a :obj:`budou.chunk.Chunk`.
    """
    self._check(value)
    self.list.insert(index, value)

  def get_overlaps(self, offset, length):
    """Returns chunks overlapped with the given range.

    Offsets are character positions in the concatenation of all chunk words.

    Args:
      offset (int): Begin offset of the range.
      length (int): Length of the range.

    Returns:
      Overlapped chunks. (:obj:`budou.chunk.ChunkList`) Empty when the range
      starts at or after the end of the text.
    """
    text = ''.join(chunk.word for chunk in self)
    # In case entity's offset points to a space just before the entity.
    # The bounds check keeps an offset at or past the end of the text from
    # raising IndexError; such a range simply overlaps nothing.
    if offset < len(text) and text[offset] == ' ':
      offset += 1
    range_end = offset + length
    index = 0
    result = ChunkList()
    for chunk in self:
      chunk_end = index + len(chunk.word)
      if offset < chunk_end and index < range_end:
        result.append(chunk)
      index = chunk_end
    return result

  def swap(self, old_chunks, new_chunk):
    """Swaps old consecutive chunks with new chunk.

    Everything between the first and the last of :code:`old_chunks`
    (inclusive) is removed, and :code:`new_chunk` takes that position.

    Args:
      old_chunks (:obj:`budou.chunk.ChunkList`): List of consecutive Chunks to
          be removed.
      new_chunk (:obj:`budou.chunk.Chunk`): A Chunk to be inserted.

    Raises:
      ValueError: If one of :code:`old_chunks` is not in this list.
      IndexError: If :code:`old_chunks` is empty.
    """
    indexes = [self.index(chunk) for chunk in old_chunks]
    del self[indexes[0]:indexes[-1] + 1]
    self.insert(indexes[0], new_chunk)

  def resolve_dependencies(self):
    """Resolves chunk dependency by concatenating them.

    Runs a forward pass, then a backward pass, then replaces trailing spaces
    of CJK chunks with breakline chunks.
    """
    self._concatenate_inner(True)
    self._concatenate_inner(False)
    self._insert_breaklines()

  def _concatenate_inner(self, direction):
    """Concatenates chunks based on each chunk's dependency.

    Chunks are scanned in the given direction. A chunk whose dependency equals
    :code:`direction` (and, on the backward pass, any SPACE chunk) is held
    back and merged into the next chunk that does not match. The merged chunk
    takes its pos, label and dependency from that terminating chunk. Chunks
    still held back at the end of the scan are kept unmerged.

    Args:
      direction (bool): Direction of concatenation process. True for forward.
    """
    source_chunks = self if direction else self[::-1]
    merged = []
    bucket = []
    for chunk in source_chunks:
      bucket.append(chunk)
      if (
          # if the chunk has matched dependency, do concatenation.
          chunk.dependency == direction or
          # if the chunk is SPACE, concatenate to the previous chunk.
          (direction is False and chunk.is_space())
          ):
        continue
      words = [pending.word for pending in bucket]
      if not direction:
        # The backward scan collected the words right-to-left; restore the
        # reading order before joining.
        words.reverse()
      merged.append(Chunk(''.join(words), pos=chunk.pos, label=chunk.label,
                          dependency=chunk.dependency))
      bucket = []
    # Leftovers never met a terminating chunk; keep them as they are.
    merged.extend(bucket)
    if not direction:
      merged.reverse()
    self.list = merged

  def _insert_breaklines(self):
    """Inserts a breakline instead of a trailing space if the chunk is in CJK.
    """
    target_chunks = []
    for chunk in self:
      # endswith() is safe for an empty word, unlike indexing word[-1].
      if chunk.word.endswith(' ') and chunk.has_cjk():
        chunk.word = chunk.word[:-1]
        target_chunks.append(chunk)
        target_chunks.append(chunk.breakline())
      else:
        target_chunks.append(chunk)
    self.list = target_chunks

  def html_serialize(self, attributes, max_length=None, use_wbr=False):
    """Returns concatenated HTML code with SPAN tag.

    Args:
      attributes (dict): A map of name-value pairs for attributes of output
          SPAN tags. Not used when :code:`use_wbr` is set.
      max_length (int, optional): Maximum length of span enclosed chunk.
          Not used when :code:`use_wbr` is set.
      use_wbr (bool, optional): Use WBR tag to serialize the output.

    Returns:
      The organized HTML code. (str)
    """
    if use_wbr:
      # wbr_serialize() takes no arguments; passing max_length to it raised
      # TypeError.
      return self.wbr_serialize()
    return self.span_serialize(attributes, max_length)

  @staticmethod
  def _append_text(doc, text):
    """Appends bare text after the last child of doc, or to doc's own text.

    Args:
      doc (:obj:`xml.etree.ElementTree.Element`): Element to extend.
      text (str): Text to append.
    """
    # len()/indexing replace Element.getchildren(), which was removed in
    # Python 3.9.
    if len(doc):
      last_child = doc[-1]
      if last_child.tail is None:
        last_child.tail = text
      else:
        last_child.tail += text
    elif doc.text is None:
      doc.text = text
    else:
      doc.text += text

  def span_serialize(self, attributes, max_length=None):
    """Returns concatenated HTML code with SPAN tag.

    Chunks that contain CJK characters and do not exceed :code:`max_length`
    are wrapped in a SPAN carrying :code:`attributes`; all other chunks are
    emitted as bare text. The result is passed through html5lib with
    sanitizing enabled.

    Args:
      attributes (dict): A map of name-value pairs for attributes of output
          SPAN tags.
      max_length (int, optional): Maximum length of span enclosed chunk.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    for chunk in self:
      if (chunk.has_cjk() and
          not (max_length and len(chunk.word) > max_length)):
        ele = ET.Element('span')
        ele.text = chunk.word
        for key, val in attributes.items():
          ele.attrib[key] = val
        doc.append(ele)
      else:
        # add word without span tag for non-CJK text (e.g. English)
        # by appending it after the last element
        self._append_text(doc, chunk.word)
    result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    result = html5lib.serialize(
        html5lib.parseFragment(result), sanitize=True,
        quote_attr_values='always')
    return result

  def wbr_serialize(self):
    """Returns concatenated HTML code with WBR tag. This is still experimental.

    A WBR element is placed before every CJK chunk once the wrapping SPAN has
    leading text. The wrapping SPAN is styled with
    ``word-break: keep-all``.

    Returns:
      The organized HTML code. (str)
    """
    doc = ET.Element('span')
    doc.attrib['style'] = 'word-break: keep-all'
    for chunk in self:
      if chunk.has_cjk() and doc.text:
        ele = ET.Element('wbr')
        ele.tail = chunk.word
        doc.append(ele)
      else:
        # add word without span tag for non-CJK text (e.g. English)
        # by appending it after the last element
        self._append_text(doc, chunk.word)
    content = ET.tostring(doc, encoding='utf-8').decode('utf-8')
    dom = html5lib.parseFragment(content)
    treewalker = getTreeWalker('etree')
    stream = treewalker(dom)
    serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values='always')
    # Extend the sanitizer's allow-lists so that the WBR element and the
    # word-break property set above are let through.
    allowed_elements = set(sanitizer.allowed_elements)
    allowed_elements.add((namespaces['html'], 'wbr'))
    allowed_css_properties = set(sanitizer.allowed_css_properties)
    allowed_css_properties.add('word-break')
    result = serializer.render(sanitizer.Filter(
        stream, allowed_elements=allowed_elements,
        allowed_css_properties=allowed_css_properties,
        ))
    return result

  def separator_serialize(self, separator):
    """Returns concatenated chunks with a custom separator in between.

    Args:
      separator (str): String placed between the words of adjacent chunks.

    Returns:
      The organized string with custom separator (str)
    """
    return separator.join(chunk.word for chunk in self)
# Source code for budou.chunk
#
# budou
#
# Navigation
#
# Related Topics