# -*- coding: utf-8 -*-
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Chunk module as a unit of word segment with helpers.
"""
import collections
import collections.abc
import unicodedata
from xml.etree import ElementTree as ET

import html5lib
from html5lib import getTreeWalker
from html5lib.constants import namespaces
from html5lib.filters import sanitizer
class Chunk:
    """A unit for word segmentation.

    Attributes:
        word (str): Surface word of the chunk.
        pos (str, optional): Part of speech.
        label (str, optional): Label information.
        dependency (bool, optional): Dependency to neighbor words.
            :code:`None` for no dependency, :code:`True` for dependency to
            the following word, and :code:`False` for the dependency to the
            previous word.

    Args:
        word (str): Surface word of the chunk.
        pos (str, optional): Part of speech.
        label (str, optional): Label information.
        dependency (bool, optional): Dependency to neighbor words.
            :code:`None` for no dependency, :code:`True` for dependency to
            the following word, and :code:`False` for the dependency to the
            previous word.
    """

    # Part-of-speech markers assigned to the synthetic chunks created by
    # space() and breakline().
    _SPACE_POS = 'SPACE'
    _BREAK_POS = 'BREAK'

    # Inclusive code point ranges that has_cjk() treats as CJK. Taken from
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/util.py#L149
    # Kept at class level so the table is built once, not on every call.
    _CJK_CODEPOINT_RANGES = (
        (4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215),
        (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607),
    )

    def __init__(self, word, pos=None, label=None, dependency=None):
        self.word = word
        self.pos = pos
        self.label = label
        self.dependency = dependency

    def __repr__(self):
        return 'Chunk(%s, %s, %s, %s)' % (
            repr(self.word), self.pos, self.label, self.dependency)

    @classmethod
    def space(cls):
        """Creates a chunk holding a single space.

        Returns:
            A chunk (:obj:`budou.chunk.Chunk`) whose word is ``' '`` and
            whose part of speech is the space marker.
        """
        return cls(u' ', cls._SPACE_POS)

    @classmethod
    def breakline(cls):
        """Creates a chunk holding a single line break.

        Returns:
            A chunk (:obj:`budou.chunk.Chunk`) whose word is ``'\\n'`` and
            whose part of speech is the break marker.
        """
        return cls(u'\n', cls._BREAK_POS)

    def serialize(self):
        """Returns the chunk data as a dictionary.

        Returns:
            dict: The ``word``, ``pos``, ``label`` and ``dependency``
            attributes plus ``has_cjk``, the result of :meth:`has_cjk`.
        """
        return {
            'word': self.word,
            'pos': self.pos,
            'label': self.label,
            'dependency': self.dependency,
            'has_cjk': self.has_cjk(),
        }

    def is_space(self):
        """Whether the chunk is a space.

        The decision is based on the part of speech, not on the word.

        Returns:
            bool: True if it is a space.
        """
        return self.pos == self._SPACE_POS

    def is_punct(self):
        """Whether the chunk is a punctuation mark.

        Only single-character words are considered; the character must be
        in one of the Unicode ``P*`` general categories.
        See also https://en.wikipedia.org/wiki/Unicode_character_property

        Returns:
            bool: True if it is a punctuation mark.
        """
        return (len(self.word) == 1 and
                unicodedata.category(self.word)[0] == 'P')

    def is_open_punct(self):
        """Whether the chunk is an open punctuation mark.

        Ps: Punctuation, open (e.g. opening bracket characters)
        Pi: Punctuation, initial quote (e.g. opening quotation mark)
        See also https://en.wikipedia.org/wiki/Unicode_character_property

        Returns:
            bool: True if it is an open punctuation mark.
        """
        return (self.is_punct() and
                unicodedata.category(self.word) in {'Ps', 'Pi'})

    def has_cjk(self):
        """Checks if the word of the chunk contains CJK characters.

        A character counts as CJK when its code point falls inside one of
        the inclusive ranges in ``_CJK_CODEPOINT_RANGES``.

        Returns:
            bool: True if the chunk has any CJK character.
        """
        # Generator form lets any() stop at the first matching character.
        return any(
            start <= ord(char) <= end
            for char in self.word
            for start, end in self._CJK_CODEPOINT_RANGES)
class ChunkList(collections.abc.MutableSequence):
    """List of :obj:`budou.chunk.Chunk` with some helpers.

    This list accepts only instances of :obj:`budou.chunk.Chunk`.

    Example:
        .. code-block:: python

            from budou.chunk import Chunk, ChunkList
            chunks = ChunkList(Chunk('abc'), Chunk('def'))
            chunks.append(Chunk('ghi'))  # OK
            chunks.append('jkl')  # NG

    Attributes:
        list (list): Underlying storage for the chunks.

    Args:
        args (list of :obj:`budou.chunk.Chunk`): Initial values included in
            the list.
    """

    def __init__(self, *args):
        self.list = []
        # extend() goes through insert(), so every initial value is checked.
        self.extend(args)

    def _check(self, val):
        """Checks if the value is an instance of :obj:`budou.chunk.Chunk`.

        Args:
            val (:obj:`budou.chunk.Chunk`): input to check

        Raises:
            TypeError: If :code:`val` is not an instance of
                :obj:`budou.chunk.Chunk`.
        """
        if not isinstance(val, Chunk):
            raise TypeError(
                'ChunkList accepts only Chunk instances, got %s'
                % type(val).__name__)

    def __len__(self):
        return len(self.list)

    def __getitem__(self, i):
        # Slices are delegated to the underlying list as well.
        return self.list[i]

    def __delitem__(self, i):
        del self.list[i]

    def __setitem__(self, i, v):
        self._check(v)
        self.list[i] = v

    def insert(self, index, value):
        """Inserts a chunk before the given index.

        Args:
            index (int): Position to insert at.
            value (:obj:`budou.chunk.Chunk`): Chunk to insert.

        Raises:
            TypeError: If :code:`value` is not a :obj:`budou.chunk.Chunk`.
        """
        self._check(value)
        self.list.insert(index, value)

    def get_overlaps(self, offset, length):
        """Returns chunks overlapped with the given range.

        Offsets are character positions in the concatenation of all chunk
        words. A chunk is selected when its character span intersects
        ``[offset, offset + length)``. An offset at or past the end of the
        text yields an empty result.

        Args:
            offset (int): Begin offset of the range.
            length (int): Length of the range.

        Returns:
            Overlapped chunks. (:obj:`budou.chunk.ChunkList`)
        """
        text = ''.join(chunk.word for chunk in self)
        # In case entity's offset points to a space just before the entity.
        # The bound check keeps an out-of-range offset from raising.
        if offset < len(text) and text[offset] == ' ':
            offset += 1
        index = 0
        result = ChunkList()
        for chunk in self:
            if (offset < index + len(chunk.word) and
                    index < offset + length):
                result.append(chunk)
            index += len(chunk.word)
        return result

    def swap(self, old_chunks, new_chunk):
        """Swaps old consecutive chunks with new chunk.

        Everything between the first and the last chunk of
        :code:`old_chunks` (inclusive) is removed and :code:`new_chunk` is
        inserted at the position of the first one.

        Args:
            old_chunks (:obj:`budou.chunk.ChunkList`): List of consecutive
                Chunks to be removed.
            new_chunk (:obj:`budou.chunk.Chunk`): A Chunk to be inserted.

        Raises:
            TypeError: If :code:`new_chunk` is not a
                :obj:`budou.chunk.Chunk`.
            ValueError: If a chunk of :code:`old_chunks` is not in the list.
            IndexError: If :code:`old_chunks` is empty.
        """
        # Validate before deleting so a bad value cannot leave the list
        # with the old chunks removed and nothing inserted.
        self._check(new_chunk)
        indexes = [self.index(chunk) for chunk in old_chunks]
        del self[indexes[0]:indexes[-1] + 1]
        self.insert(indexes[0], new_chunk)

    def resolve_dependencies(self):
        """Resolves chunk dependency by concatenating them.

        Runs a forward concatenation pass, then a backward one, and
        finally replaces trailing spaces of CJK chunks with breaklines.
        """
        self._concatenate_inner(True)
        self._concatenate_inner(False)
        self._insert_breaklines()

    def _concatenate_inner(self, direction):
        """Concatenates chunks based on each chunk's dependency.

        Chunks are scanned front to back when :code:`direction` is true and
        back to front otherwise. A chunk whose dependency equals
        :code:`direction` is buffered and merged into the next scanned
        chunk that does not; the merged chunk takes its pos, label and
        dependency from that chunk. In the backward pass space chunks are
        buffered as well. Chunks still buffered when the scan ends are kept
        as they are.

        Args:
            direction (bool): Direction of concatenation process. True for
                forward.
        """
        source = list(self)
        if not direction:
            source.reverse()
        pending = []
        merged = []
        for chunk in source:
            pending.append(chunk)
            if (
                    # if the chunk has matched dependency, do concatenation.
                    chunk.dependency == direction or
                    # if the chunk is SPACE, concatenate to the previous
                    # chunk.
                    (direction is False and chunk.is_space())
            ):
                continue
            if not direction:
                # Buffered in reverse scan order; restore reading order.
                pending.reverse()
            new_word = ''.join(item.word for item in pending)
            merged.append(Chunk(new_word, pos=chunk.pos, label=chunk.label,
                                dependency=chunk.dependency))
            pending = []
        # Leftovers had no chunk to attach to; keep them unmerged.
        merged.extend(pending)
        if not direction:
            merged.reverse()
        # Always store a plain list so the container is never nested.
        self.list = merged

    def _insert_breaklines(self):
        """Inserts a breakline instead of a trailing space if the chunk is
        in CJK.

        The trailing space is stripped from the chunk's word in place and a
        breakline chunk is added right after it.
        """
        result = []
        for chunk in self:
            # endswith() is safe on an empty word, unlike word[-1].
            if chunk.word.endswith(' ') and chunk.has_cjk():
                chunk.word = chunk.word[:-1]
                result.append(chunk)
                result.append(chunk.breakline())
            else:
                result.append(chunk)
        self.list = result

    def html_serialize(self, attributes, max_length=None, use_wbr=False):
        """Returns concatenated HTML code with SPAN tag.

        Args:
            attributes (dict): A map of name-value pairs for attributes of
                output SPAN tags. Not used when :code:`use_wbr` is true.
            max_length (int, optional): Maximum length of span enclosed
                chunk. Not used when :code:`use_wbr` is true.
            use_wbr (bool, optional): Use WBR tag to serialize the output.

        Returns:
            The organized HTML code. (str)
        """
        if use_wbr:
            # wbr_serialize() takes no arguments.
            return self.wbr_serialize()
        return self.span_serialize(attributes, max_length)

    @staticmethod
    def _append_text(doc, text):
        """Appends plain text at the current end of an element's content.

        Args:
            doc (xml.etree.ElementTree.Element): Element to append to.
            text (str): Text to add after the last child, or to the
                element's own text if it has no children.
        """
        if len(doc):
            last = doc[-1]
            last.tail = (last.tail or '') + text
        else:
            doc.text = (doc.text or '') + text

    def span_serialize(self, attributes, max_length=None):
        """Returns concatenated HTML code with SPAN tag.

        Each CJK chunk is wrapped in its own SPAN carrying
        :code:`attributes`, unless :code:`max_length` is set and the chunk
        is longer than it. All other chunks are emitted as plain text. The
        result is passed through html5lib's sanitizing serializer.

        Args:
            attributes (dict): A map of name-value pairs for attributes of
                output SPAN tags.
            max_length (int, optional): Maximum length of span enclosed
                chunk.

        Returns:
            The organized HTML code. (str)
        """
        doc = ET.Element('span')
        for chunk in self:
            if (chunk.has_cjk() and
                    not (max_length and len(chunk.word) > max_length)):
                ele = ET.Element('span')
                ele.text = chunk.word
                for key, val in attributes.items():
                    ele.attrib[key] = val
                doc.append(ele)
            else:
                # add word without span tag for non-CJK text (e.g. English)
                # by appending it after the last element
                self._append_text(doc, chunk.word)
        result = ET.tostring(doc, encoding='utf-8').decode('utf-8')
        result = html5lib.serialize(
            html5lib.parseFragment(result), sanitize=True,
            quote_attr_values='always')
        return result

    def wbr_serialize(self):
        """Returns concatenated HTML code with WBR tag. This is still
        experimental.

        The chunks are placed in a SPAN styled with
        ``word-break: keep-all``. A WBR element is put in front of every
        CJK chunk once the SPAN has leading text; other chunks are appended
        as plain text.

        Returns:
            The organized HTML code. (str)
        """
        doc = ET.Element('span')
        doc.attrib['style'] = 'word-break: keep-all'
        for chunk in self:
            if chunk.has_cjk() and doc.text:
                ele = ET.Element('wbr')
                doc.append(ele)
                ele.tail = chunk.word
            else:
                # add word without span tag for non-CJK text (e.g. English)
                # by appending it after the last element
                self._append_text(doc, chunk.word)
        content = ET.tostring(doc, encoding='utf-8').decode('utf-8')
        dom = html5lib.parseFragment(content)
        treewalker = getTreeWalker('etree')
        stream = treewalker(dom)
        serializer = html5lib.serializer.HTMLSerializer(
            quote_attr_values='always')
        # Extend the sanitizer's allow-lists so the WBR tags and the
        # word-break style set above are passed to the filter as permitted.
        allowed_elements = set(sanitizer.allowed_elements)
        allowed_elements.add((namespaces['html'], 'wbr'))
        allowed_css_properties = set(sanitizer.allowed_css_properties)
        allowed_css_properties.add('word-break')
        result = serializer.render(sanitizer.Filter(
            stream, allowed_elements=allowed_elements,
            allowed_css_properties=allowed_css_properties,
        ))
        return result

    def separator_serialize(self, separator):
        """Returns concatenated chunks with a custom separator in between.

        Args:
            separator (str): String placed between consecutive chunk words.

        Returns:
            The organized string with custom separator (str)
        """
        return separator.join(chunk.word for chunk in self)