Source code for budou.nlapisegmenter

# -*- coding: utf-8 -*-
# Copyright 2018 Google LLC
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.

"""Natural Language API based Segmenter.

Word segmenter module powered by
`Cloud Natural Language API <https://cloud.google.com/natural-language/>`_.
You need to enable the API in your Google Cloud Platform project before you
use this module.

  Once you have enabled the API, download a service account's credentials and
  set the file path as the `GOOGLE_APPLICATION_CREDENTIALS` environment
  variable.

  .. code-block:: bash

     $ export GOOGLE_APPLICATION_CREDENTIALS='/path/to/credentials.json'

  Alternatively, you can also pass the path to your credentials file to the
  segmenter's constructor through the `credentials_path` argument.

  .. code-block:: python

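     segmenter = budou.nlapisegmenter.NLAPISegmenter(
         cache_filename='/path/to/cache.pickle',
         credentials_path='/path/to/credentials.json',
         use_entity=False,
         use_cache=True)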

This module is equipped with a caching system so that it does not make
multiple requests for the same source sentence, because each request to the
API may incur costs. The caching system is provided by `budou.cachefactory`,
and a suitable cache backend is chosen based on the environment.
"""

from __future__ import unicode_literals
import logging
import hashlib
from .segmenter import Segmenter
from .cachefactory import load_cache
from .chunk import Chunk, ChunkList

    'P', 'SNUM', 'PRT', 'AUX', 'SUFF', 'AUXPASS', 'RDROP', 'NUMBER', 'NUM',
""" list of str: Labels dependent to other parts.

def _memorize(func):
  """Decorator to cache the given method's output.

  The decorated method must be bound to an object that exposes a boolean
  ``use_cache`` attribute and a ``cache_filename`` attribute; the latter is
  handed to :func:`load_cache` to obtain the cache backend.

  Args:
    func (callable): Method whose return value should be cached.

  Returns:
    callable: A wrapper with the same call signature as ``func``.
  """
  # Imported locally so the decorator does not rely on a file-level import.
  import functools

  @functools.wraps(func)
  def _wrapper(self, *args, **kwargs):
    """Returns the cached value when available, otherwise calls ``func``."""
    if not self.use_cache:
      return func(self, *args, **kwargs)

    cache = load_cache(self.cache_filename)
    # The class and method names are part of the key so that two different
    # methods called with identical arguments never share a cache entry.
    # Keyword values are ordered by keyword name to keep the key stable.
    original_key = ':'.join([
        self.__class__.__name__,
        func.__name__,
        '_'.join([str(a) for a in args]),
        '_'.join([str(kwargs[name]) for name in sorted(kwargs)])])
    cache_key = hashlib.md5(original_key.encode('utf-8')).hexdigest()
    cached_val = cache.get(cache_key)
    # Compare against None so that falsy results (e.g. an empty list) are
    # still served from the cache.
    # NOTE(review): assumes cache.get() returns None on a miss -- confirm
    # against the backends in budou.cachefactory.
    if cached_val is not None:
      return cached_val
    val = func(self, *args, **kwargs)
    cache.set(cache_key, val)
    return val
  return _wrapper

class NLAPISegmenter(Segmenter):
  """Natural Language API Segmenter.

  Attributes:
    service: A resource object for interacting with Cloud Natural Language API.
    cache_filename (str): File path to the cache file.
    supported_languages (set of str): Set of supported languages' codes.

  Args:
    cache_filename (str, optional): File path to the pickle file for caching.
        The file is created automatically if not exist. If the environment is
        Google App Engine Standard Environment and memcache service is
        available, it is used for caching and the pickle file won't be
        generated.
    credentials_path (str, optional): File path to the service account's
        credentials file. If no file path is specified, it tries to
        authenticate with default credentials.
    use_entity (bool, optional): Whether to use entity analysis results to
        wrap entity names in the output.
    use_cache (bool, optional): Whether to use a cache system.
    cache_discovery (bool, optional): Whether to use the cache to build the
        natural language API service [default: True]. When using
        oauth2client >= 4.0.0 or google-auth, its value should be False.
    service (:obj:`googleapiclient.discovery.Resource`, optional): A Resource
        object for interacting with Cloud Natural Language API. If this is
        given, the constructor skips the authentication process and use this
        service instead.
  """

  supported_languages = {'ja', 'ko', 'zh', 'zh-TW', 'zh-CN', 'zh-HK', 'zh-Hant'}

  def __init__(self, cache_filename, credentials_path, use_entity, use_cache,
               cache_discovery=True, service=None):
    self.cache_filename = cache_filename
    self.credentials_path = credentials_path
    self.use_entity = use_entity
    self.use_cache = use_cache
    self.service = service
    if self.service is None:
      self.service = self._authenticate(cache_discovery)

  def _authenticate(self, cache_discovery):
    """Builds an authorized Natural Language API service object.

    Uses the service account file at :code:`credentials_path` when it is set,
    and application default credentials otherwise.

    Args:
      cache_discovery (bool): Forwarded to the discovery client builder.

    Returns:
      A resource object for interacting with Cloud Natural Language API.

    Raises:
      ImportError: If a credentials path is set but
          :code:`google.oauth2.service_account` cannot be imported.
    """
    import google_auth_httplib2
    import googleapiclient.discovery
    # NOTE(review): the scope URL was blank in the reviewed copy; restored to
    # the Cloud Platform scope -- confirm against the canonical source.
    scope = ['https://www.googleapis.com/auth/cloud-platform']
    if self.credentials_path:
      try:
        from google.oauth2 import service_account
      except ImportError:
        logging.error('Failed to load google.oauth2.service_account module. '
                      'If you are running this script in Google App Engine '
                      'environment, you can initialize the segmenter with '
                      'default credentials.')
        # Without the module there are no credentials to continue with, so
        # surface the real cause instead of failing later on an unbound name.
        raise
      credentials = service_account.Credentials.from_service_account_file(
          self.credentials_path)
      scoped_credentials = credentials.with_scopes(scope)
    else:
      import google.auth
      scoped_credentials, _ = google.auth.default(scope)
    authed_http = google_auth_httplib2.AuthorizedHttp(scoped_credentials)
    service = googleapiclient.discovery.build(
        'language', 'v1beta2', http=authed_http,
        cache_discovery=cache_discovery)
    return service

  def segment(self, source, language=None):
    """Returns a chunk list from the given sentence.

    Args:
      source (str): Source string to segment.
      language (str, optional): A language code.

    Returns:
      A chunk list. (:obj:`budou.chunk.ChunkList`)

    Raises:
      ValueError: If :code:`language` is given and it is not included in
          :code:`supported_languages`.
    """
    if language and language not in self.supported_languages:
      raise ValueError(
          'Language {} is not supported by NLAPI segmenter'.format(language))

    # The language reported by the API replaces the (possibly empty) hint so
    # that the entity request below uses the same language.
    chunks, language = self._get_source_chunks(source, language=language)
    if self.use_entity:
      entities = self._get_entities(source, language=language)
      chunks = self._group_chunks_by_entities(chunks, entities)
    chunks.resolve_dependencies()
    return chunks

  def _get_source_chunks(self, input_text, language=None):
    """Returns a chunk list retrieved from Syntax Analysis results.

    Args:
      input_text (str): Text to annotate.
      language (str, optional): Language of the text.

    Returns:
      A tuple of the chunk list (:obj:`budou.chunk.ChunkList`) and the
      language code reported in the annotation result.
    """
    chunks = ChunkList()
    seek = 0
    result = self._get_annotations(input_text, language=language)
    tokens = result['tokens']
    language = result['language']
    for i, token in enumerate(tokens):
      word = token['text']['content']
      begin_offset = token['text']['beginOffset']
      label = token['dependencyEdge']['label']
      pos = token['partOfSpeech']['tag']
      if begin_offset > seek:
        # The token starts past the end of the previous one, i.e. there is a
        # gap in the source text between them.
        # NOTE(review): the appended value was missing from the reviewed copy;
        # restored as a space chunk -- confirm against the canonical source.
        chunks.append(Chunk.space())
        seek = begin_offset
      chunk = Chunk(word, pos, label)
      if chunk.label in _DEPENDENT_LABEL:
        # Determining concatenating direction based on syntax dependency.
        chunk.dependency = i < token['dependencyEdge']['headTokenIndex']
      if chunk.is_punct():
        chunk.dependency = chunk.is_open_punct()
      chunks.append(chunk)
      seek += len(word)
    return chunks, language

  def _group_chunks_by_entities(self, chunks, entities):
    """Groups chunks by entities retrieved from NL API Entity Analysis.

    Args:
      chunks (:obj:`budou.chunk.ChunkList`): List of chunks to be processed.
      entities (:obj:`list` of :obj:`dict`): List of entities.

    Returns:
      A chunk list. (:obj:`budou.chunk.ChunkList`)
    """
    for entity in entities:
      chunks_to_concat = chunks.get_overlaps(
          entity['beginOffset'], len(entity['content']))
      if not chunks_to_concat:
        continue
      new_chunk_word = u''.join([chunk.word for chunk in chunks_to_concat])
      new_chunk = Chunk(new_chunk_word)
      chunks.swap(chunks_to_concat, new_chunk)
    return chunks

  @_memorize
  def _get_annotations(self, text, language=''):
    """Returns the list of annotations retrieved from the given text.

    Args:
      text (str): Input text.
      language (str, optional): Language code.

    Returns:
      Results in a dictionary. :code:`tokens` contains the list of annotations
      and :code:`language` contains the inferred language from the input.
    """
    body = {
        'document': {
            'type': 'PLAIN_TEXT',
            'content': text,
        },
        'features': {
            'extract_syntax': True,
        },
        'encodingType': 'UTF32',
    }
    if language:
      body['document']['language'] = language

    request = self.service.documents().annotateText(body=body)
    response = request.execute()
    tokens = response.get('tokens', [])
    language = response.get('language')

    return {'tokens': tokens, 'language': language}

  @_memorize
  def _get_entities(self, text, language=''):
    """Returns the list of entities retrieved from the given text.

    Only the first mention of each entity is used. A mention that contains
    whitespace is split into one entry per word.

    Args:
      text (str): Input text.
      language (str, optional): Language code.

    Returns:
      List of entities. Each item is a dictionary with :code:`content` (the
      word) and :code:`beginOffset` (the word's offset in the input text).
    """
    body = {
        'document': {
            'type': 'PLAIN_TEXT',
            'content': text,
        },
        'encodingType': 'UTF32',
    }
    if language:
      body['document']['language'] = language

    request = self.service.documents().analyzeEntities(body=body)
    response = request.execute()
    result = []
    for entity in response.get('entities', []):
      mentions = entity.get('mentions', [])
      if not mentions:
        continue
      entity_text = mentions[0]['text']
      content = entity_text['content']
      base_offset = entity_text['beginOffset']
      # Locate each word inside the mention so its offset also accounts for
      # the whitespace that separates it from the preceding words.
      cursor = 0
      for word in content.split():
        start = content.index(word, cursor)
        result.append({'content': word, 'beginOffset': base_offset + start})
        cursor = start + len(word)
    return result