# -*- coding: utf-8 -*-
# Copyright 2018 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.

"""Chunk module as a unit of word segment with helpers."""

import collections
import collections.abc
import unicodedata
from xml.etree import ElementTree as ET

import html5lib
from html5lib import getTreeWalker
from html5lib.constants import namespaces
from html5lib.filters import sanitizer

class Chunk:
    """A unit for word segmentation.

    Attributes:
        word (str): Surface word of the chunk.
        pos (str, optional): Part of speech.
        label (str, optional): Label information.
        dependency (bool, optional): Dependency to neighbor words.
            :code:`None` for no dependency, :code:`True` for dependency to
            the following word, and :code:`False` for the dependency to the
            previous word.

    Args:
        word (str): Surface word of the chunk.
        pos (str, optional): Part of speech.
        label (str, optional): Label information.
        dependency (bool, optional): Dependency to neighbor words.
            :code:`None` for no dependency, :code:`True` for dependency to
            the following word, and :code:`False` for the dependency to the
            previous word.
    """

    # Part-of-speech markers used for the synthetic chunks created by
    # :meth:`space` and :meth:`breakline`.
    _SPACE_POS = 'SPACE'
    _BREAK_POS = 'BREAK'

    # Inclusive Unicode codepoint ranges that :meth:`has_cjk` treats as CJK.
    _CJK_CODEPOINT_RANGES = (
        (4352, 4607),
        (11904, 42191),
        (43072, 43135),
        (44032, 55215),
        (63744, 64255),
        (65072, 65103),
        (65381, 65500),
        (131072, 196607),
    )

    def __init__(self, word, pos=None, label=None, dependency=None):
        self.word = word
        self.pos = pos
        self.label = label
        self.dependency = dependency

    def __repr__(self):
        return 'Chunk(%s, %s, %s, %s)' % (
            repr(self.word), self.pos, self.label, self.dependency)

    @classmethod
    def space(cls):
        """Creates space Chunk.

        Returns:
            A chunk (:obj:`budou.chunk.Chunk`) whose word is a single space.
        """
        return cls(u' ', cls._SPACE_POS)

    @classmethod
    def breakline(cls):
        """Creates breakline Chunk.

        Returns:
            A chunk (:obj:`budou.chunk.Chunk`) whose word is a newline.
        """
        return cls(u'\n', cls._BREAK_POS)

    def serialize(self):
        """Returns serialized chunk data in dictionary.

        Returns:
            dict: The chunk's ``word``, ``pos``, ``label`` and ``dependency``
            attributes plus ``has_cjk``, the result of :meth:`has_cjk`.
        """
        return {
            'word': self.word,
            'pos': self.pos,
            'label': self.label,
            'dependency': self.dependency,
            'has_cjk': self.has_cjk(),
        }

    def is_space(self):
        """Whether the chunk is a space.

        The check is based on the part of speech, not on the word itself.

        Returns:
            bool: True if it is a space.
        """
        return self.pos == self._SPACE_POS

    def is_punct(self):
        """Whether the chunk is a punctuation mark.

        A chunk counts as punctuation when its word is exactly one character
        whose Unicode general category starts with ``P``.

        Returns:
            bool: True if it is a punctuation mark.
        """
        # unicodedata.category() only accepts a single character, so the
        # length test must come first.
        return (len(self.word) == 1 and
                unicodedata.category(self.word).startswith('P'))

    def is_open_punct(self):
        """Whether the chunk is an open punctuation mark.

        The following Unicode general categories are treated as open:

        * Ps: Punctuation, open (e.g. opening bracket characters)
        * Pi: Punctuation, initial quote (e.g. opening quotation mark)

        Returns:
            bool: True if it is an open punctuation mark.
        """
        return (self.is_punct() and
                unicodedata.category(self.word) in {'Ps', 'Pi'})

    def has_cjk(self):
        """Checks if the word of the chunk contains CJK characters.

        Each character's codepoint is tested against a fixed table of
        inclusive Unicode codepoint ranges.

        Returns:
            bool: True if the chunk has any CJK character.
        """
        for char in self.word:
            codepoint = ord(char)
            if any(start <= codepoint <= end
                   for start, end in self._CJK_CODEPOINT_RANGES):
                return True
        return False
class ChunkList(collections.abc.MutableSequence):
    """List of :obj:`budou.chunk.Chunk` with some helpers.

    This list accepts only instances of :obj:`budou.chunk.Chunk`.

    Example:
        .. code-block:: python

            from budou.chunk import Chunk, ChunkList
            chunks = ChunkList(Chunk('abc'), Chunk('def'))
            chunks.append(Chunk('ghi'))  # OK
            chunks.append('jkl')         # NG

    Args:
        args (list of :obj:`budou.chunk.Chunk`): Initial values included in
            the list.
    """

    def __init__(self, *args):
        # Backing storage; always a plain list of Chunk instances.
        self.list = []
        self.extend(args)

    def _check(self, val):
        """Checks if the value is an instance of :obj:`budou.chunk.Chunk`.

        Args:
            val (:obj:`budou.chunk.Chunk`): input to check

        Raises:
            TypeError: If :code:`val` is not an instance of
                :obj:`budou.chunk.Chunk`.
        """
        if not isinstance(val, Chunk):
            raise TypeError(
                'ChunkList accepts only Chunk instances, got %s'
                % type(val).__name__)

    def __len__(self):
        return len(self.list)

    def __getitem__(self, i):
        # Slices are delegated to the backing list, so they come back as
        # plain lists rather than ChunkList instances.
        return self.list[i]

    def __delitem__(self, i):
        del self.list[i]

    def __setitem__(self, i, v):
        self._check(v)
        self.list[i] = v

    def insert(self, index, value):
        """Inserts a chunk before the given index.

        Args:
            index (int): Position to insert at.
            value (:obj:`budou.chunk.Chunk`): Chunk to insert.

        Raises:
            TypeError: If :code:`value` is not a :obj:`budou.chunk.Chunk`.
        """
        self._check(value)
        self.list.insert(index, value)

    def get_overlaps(self, offset, length):
        """Returns chunks overlapped with the given range.

        Offsets are character positions in the concatenation of all chunk
        words.

        Args:
            offset (int): Begin offset of the range.
            length (int): Length of the range.

        Returns:
            Overlapped chunks. (:obj:`budou.chunk.ChunkList`)
        """
        text = ''.join(chunk.word for chunk in self)
        # In case entity's offset points to a space just before the entity.
        # The bound check keeps an offset at or past the end of the text
        # (including an empty list) from raising IndexError.
        if offset < len(text) and text[offset] == ' ':
            offset += 1
        start = 0
        result = ChunkList()
        for chunk in self:
            end = start + len(chunk.word)
            if offset < end and start < offset + length:
                result.append(chunk)
            start = end
        return result

    def swap(self, old_chunks, new_chunk):
        """Swaps old consecutive chunks with new chunk.

        Everything between the first and the last of :code:`old_chunks`
        (inclusive) is removed and :code:`new_chunk` is inserted in its place.

        Args:
            old_chunks (:obj:`budou.chunk.ChunkList`): List of consecutive
                Chunks to be removed. Must not be empty.
            new_chunk (:obj:`budou.chunk.Chunk`): A Chunk to be inserted.
        """
        indexes = [self.index(chunk) for chunk in old_chunks]
        del self[indexes[0]:indexes[-1] + 1]
        self.insert(indexes[0], new_chunk)

    def resolve_dependencies(self):
        """Resolves chunk dependency by concatenating them.

        Forward dependencies are merged first, then backward ones, and
        finally trailing spaces of CJK chunks are turned into breaklines.
        """
        self._concatenate_inner(True)
        self._concatenate_inner(False)
        self._insert_breaklines()

    def _concatenate_inner(self, direction):
        """Concatenates chunks based on each chunk's dependency.

        Args:
            direction (bool): Direction of concatenation process. True for
                forward.
        """
        bucket = []
        # For the backward pass, walk the chunks from the end so that a
        # chunk depending on its predecessor is seen before that predecessor.
        source_chunks = self if direction else self[::-1]
        merged = []
        for chunk in source_chunks:
            bucket.append(chunk)
            if (
                    # if the chunk has matched dependency, do concatenation.
                    chunk.dependency == direction or
                    # if the chunk is SPACE, concatenate to the previous
                    # chunk.
                    (direction is False and chunk.is_space())
            ):
                continue
            # `chunk` terminates the current group: join the bucket in
            # reading order and let the merged chunk inherit its metadata.
            ordered = bucket if direction else bucket[::-1]
            new_word = ''.join(item.word for item in ordered)
            merged.append(Chunk(
                new_word, pos=chunk.pos, label=chunk.label,
                dependency=chunk.dependency))
            bucket = []
        # Chunks whose dependency was never resolved are kept unmerged.
        merged.extend(bucket)
        if not direction:
            merged.reverse()
        self.list = merged

    def _insert_breaklines(self):
        """Inserts a breakline instead of a trailing space if the chunk is in
        CJK.
        """
        target_chunks = []
        for chunk in self:
            # endswith() is safe for an empty word, unlike word[-1].
            if chunk.word.endswith(' ') and chunk.has_cjk():
                chunk.word = chunk.word[:-1]
                target_chunks.append(chunk)
                target_chunks.append(chunk.breakline())
            else:
                target_chunks.append(chunk)
        self.list = target_chunks

    def html_serialize(self, attributes, max_length=None, use_wbr=False):
        """Returns concatenated HTML code with SPAN tag.

        Args:
            attributes (dict): A map of name-value pairs for attributes of
                output SPAN tags.
            max_length (int, optional): Maximum length of span enclosed
                chunk. Only used by the SPAN serialization.
            use_wbr (bool, optional): Use WBR tag to serialize the output.

        Returns:
            The organized HTML code. (str)
        """
        if use_wbr:
            # wbr_serialize() takes no arguments.
            return self.wbr_serialize()
        return self.span_serialize(attributes, max_length)

    @staticmethod
    def _append_text(doc, text):
        """Appends raw text after the last node currently inside ``doc``.

        Args:
            doc (xml.etree.ElementTree.Element): Element being built.
            text (str): Text to append.
        """
        if len(doc):
            # Text following a child element lives in that child's tail.
            last = doc[-1]
            last.tail = (last.tail or '') + text
        else:
            doc.text = (doc.text or '') + text

    def span_serialize(self, attributes, max_length=None):
        """Returns concatenated HTML code with SPAN tag.

        Args:
            attributes (dict): A map of name-value pairs for attributes of
                output SPAN tags.
            max_length (int, optional): Maximum length of span enclosed
                chunk.

        Returns:
            The organized HTML code. (str)
        """
        doc = ET.Element('span')
        for chunk in self:
            too_long = bool(max_length) and len(chunk.word) > max_length
            if chunk.has_cjk() and not too_long:
                ele = ET.SubElement(doc, 'span')
                ele.text = chunk.word
                ele.attrib.update(attributes)
            else:
                # add word without span tag for non-CJK text (e.g. English)
                # by appending it after the last element
                self._append_text(doc, chunk.word)
        result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
        result = html5lib.serialize(
            html5lib.parseFragment(result), sanitize=True,
            quote_attr_values='always')
        return result

    def wbr_serialize(self):
        """Returns concatenated HTML code with WBR tag.

        This is still experimental.

        Returns:
            The organized HTML code. (str)
        """
        doc = ET.Element('span')
        doc.attrib['style'] = 'word-break: keep-all'
        for chunk in self:
            # A WBR is only emitted once some leading text exists, so the
            # output never starts with a break opportunity.
            if chunk.has_cjk() and doc.text:
                ele = ET.SubElement(doc, 'wbr')
                ele.tail = chunk.word
            else:
                # add word without span tag for non-CJK text (e.g. English)
                # by appending it after the last element
                self._append_text(doc, chunk.word)
        content = ET.tostring(doc, encoding='utf-8').decode('utf-8')
        dom = html5lib.parseFragment(content)
        treewalker = getTreeWalker('etree')
        stream = treewalker(dom)
        serializer = html5lib.serializer.HTMLSerializer(
            quote_attr_values='always')
        # Extend the sanitizer allow-lists with the WBR element and the
        # word-break CSS property set above.
        allowed_elements = set(sanitizer.allowed_elements)
        allowed_elements.add((namespaces['html'], 'wbr'))
        allowed_css_properties = set(sanitizer.allowed_css_properties)
        allowed_css_properties.add('word-break')
        result = serializer.render(sanitizer.Filter(
            stream,
            allowed_elements=allowed_elements,
            allowed_css_properties=allowed_css_properties,
        ))
        return result

    def separator_serialize(self, separator):
        """Returns concatenated chunks with a custom separator in between.

        Args:
            separator (str): String placed between consecutive chunk words.

        Returns:
            The organized string with custom separator (str)
        """
        return separator.join(chunk.word for chunk in self)