Source code for budou.parser

# -*- coding: utf-8 -*-
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Parser modules.

Parser modules have a :code:`parse` method, which processes the input text into
a list of chunks and an HTML snippet.

Examples:

  .. code-block:: python

     import budou
     parser = budou.get_parser('nlapi')
     results = parser.parse('Google Home を使った。', classname='w')
     print(results['html_code'])
     # <span>Google <span class="w">Home を</span>
     # <span class="w">使った。</span></span>

     chunks = results['chunks']
     print(chunks[1].word)  # Home を

"""

from abc import ABCMeta
import re
from xml.etree import ElementTree as ET
import six
import html5lib
from .nlapisegmenter import NLAPISegmenter
from .mecabsegmenter import MecabSegmenter
from .tinysegmentersegmenter import TinysegmenterSegmenter

DEFAULT_CLASS_NAME = 'ww'

@six.add_metaclass(ABCMeta)
class Parser:
  """Abstract base class for parsers.

  Concrete parsers are expected to assign a segmenter instance to
  :code:`segmenter`; the base constructor leaves it as :code:`None`.

  Attributes:
    segmenter(:obj:`budou.segmenter.Segmenter`): Segmenter module.
  """

  def __init__(self):
    self.segmenter = None

  def parse(self, source, language=None, classname=None, max_length=None,
            attributes=None, inlinestyle=False):
    """Segments the source sentence and serializes the result as HTML.

    Args:
      source (str): Source sentence to process.
      language (str, optional): Language code.
      classname (str, optional): Class name of output SPAN tags.
      max_length (int, optional): Maximum length of a chunk.
      attributes (dict, optional): Attributes for output SPAN tags.
      inlinestyle (bool, optional): Add :code:`display:inline-block` as inline
          style attribute.

    Returns:
      A dictionary containing :code:`chunks` (:obj:`budou.chunk.ChunkList`)
      and :code:`html_code` (str).
    """
    # Normalize the SPAN attributes first, then clean up the input text.
    span_attributes = parse_attributes(attributes, classname, inlinestyle)
    cleaned_source = preprocess(source)
    chunks = self.segmenter.segment(cleaned_source, language)
    return {
        'chunks': chunks,
        'html_code': chunks.html_serialize(
            span_attributes, max_length=max_length),
    }
class NLAPIParser(Parser):
  """Parser backed by the Cloud Natural Language API segmenter
  (:obj:`budou.nlapisegmenter.NLAPISegmenter`).

  All arguments are passed as keyword options and forwarded to the segmenter.

  Args:
    cache_filename (string, optional): the path to the cache file.
    credentials_path (string, optional): the path to the service account's
        credentials file.
    use_entity (bool, optional): Whether to use entity analysis results to
        wrap entity names in the output. Defaults to :code:`False`.
    use_cache (bool, optional): Whether to use a cache system. Defaults to
        :code:`True`.
    service (:obj:`googleapiclient.discovery.Resource`, optional): A Resource
        object for interacting with Cloud Natural Language API. If this is
        given, the constructor skips the authentication process and uses this
        service instead.

  Attributes:
    segmenter(:obj:`budou.nlapisegmenter.NLAPISegmenter`): Segmenter module.
  """

  def __init__(self, **options):
    # Only the recognized options are forwarded; anything else is ignored.
    segmenter_options = {
        'cache_filename': options.get('cache_filename'),
        'credentials_path': options.get('credentials_path'),
        'use_entity': options.get('use_entity', False),
        'use_cache': options.get('use_cache', True),
        'service': options.get('service'),
    }
    self.segmenter = NLAPISegmenter(**segmenter_options)
class MecabParser(Parser):
  """Parser backed by the MeCab segmenter
  (:obj:`budou.mecabsegmenter.MecabSegmenter`).

  Attributes:
    segmenter(:obj:`budou.mecabsegmenter.MecabSegmenter`): Segmenter module.
  """

  def __init__(self):
    mecab_segmenter = MecabSegmenter()
    self.segmenter = mecab_segmenter
class TinysegmenterParser(Parser):
  """Parser backed by the TinySegmenter segmenter
  (:obj:`budou.tinysegmentersegmenter.TinysegmenterSegmenter`).

  Attributes:
    segmenter(:obj:`budou.tinysegmentersegmenter.TinysegmenterSegmenter`):
        Segmenter module.
  """

  def __init__(self):
    tiny_segmenter = TinysegmenterSegmenter()
    self.segmenter = tiny_segmenter
def get_parser(segmenter, **options):
  """Builds the parser that corresponds to the given segmenter name.

  Args:
    segmenter (str): Segmenter to use. One of :code:`nlapi`, :code:`mecab` or
        :code:`tinysegmenter`.
    options (dict, optional): Optional settings. They are forwarded only to
        the :code:`nlapi` parser.

  Returns:
    Parser (:obj:`budou.parser.Parser`)

  Raises:
    ValueError: If unsupported segmenter is specified.
  """
  if segmenter == 'nlapi':
    return NLAPIParser(**options)
  if segmenter == 'mecab':
    return MecabParser()
  if segmenter == 'tinysegmenter':
    return TinysegmenterParser()
  raise ValueError('Segmenter {} is not supported.'.format(segmenter))
def parse_attributes(attributes=None, classname=None, inlinestyle=False):
  """Parses attributes for output SPAN tags.

  The input dictionary is never modified; a new dictionary is returned.

  Args:
    attributes (dict, optional): Input attributes.
    classname (str, optional): Class name of output SPAN tags. Multiple class
        names may be given separated by commas; they are joined with spaces.
    inlinestyle (bool, optional): Add :code:`display:inline-block` as inline
        style attribute.

  Returns:
    Parsed attributes. (dict)
  """
  # Work on a shallow copy so the caller's dictionary is left untouched.
  # Mutating it in place made repeated calls with the same dictionary pile up
  # duplicate `display:inline-block` declarations.
  parsed = dict(attributes) if attributes else {}

  # If `classname` is specified, it overwrites `class` property in
  # `attributes`; otherwise fall back to the default class name.
  if classname:
    parsed['class'] = ' '.join(classname.split(','))
  elif 'class' not in parsed:
    parsed['class'] = DEFAULT_CLASS_NAME

  if inlinestyle:
    styles = parsed['style'].split(';') if 'style' in parsed else []
    styles.append('display:inline-block')
    parsed['style'] = ';'.join(styles)

  return parsed
def preprocess(source):
  """Strips markup, line breaks and redundant white spaces from the input.

  Args:
    source (str): Input sentence.

  Returns:
    Preprocessed sentence. (str)
  """
  # Parse the input as an HTML fragment and keep only its text content.
  fragment = html5lib.parseFragment(source)
  text_bytes = ET.tostring(fragment, encoding='utf-8', method='text')
  # Line breaks are dropped entirely rather than replaced with a space.
  text = text_bytes.decode('utf-8').replace(u'\n', u'').strip()
  # Collapse any run of two or more whitespace characters into one space.
  return re.sub(r'\s\s+', u' ', text)