# -*- coding: utf-8 -*-
#
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Natural Language API based Segmenter.
Word segmenter module powered by
`Cloud Natural Language API <https://cloud.google.com/natural-language/>`_.
You need to enable the API in your Google Cloud Platform project before you
use this module.
Example:
Once you have enabled the API, download a service account's credentials and
set them as the `GOOGLE_APPLICATION_CREDENTIALS` environment variable.
.. code-block:: bash
$ export GOOGLE_APPLICATION_CREDENTIALS='/path/to/credentials.json'
Alternatively, you can also pass the path to your credentials file to the
module.
.. code-block:: python
segmenter = budou.segmenter.NLAPISegmenter(
credentials_path='/path/to/credentials.json')
This module is equipped with a caching system to avoid making multiple
requests for the same source sentence, since requests to the API may incur costs.
The caching system is provided by `budou.cachefactory`, and a proper caching
system is chosen to be used based on the environment.
"""
from __future__ import unicode_literals

import functools
import hashlib
import logging

from builtins import str

from .cachefactory import load_cache
from .chunk import Chunk, ChunkList
from .segmenter import Segmenter
_DEPENDENT_LABEL = (
'P', 'SNUM', 'PRT', 'AUX', 'SUFF', 'AUXPASS', 'RDROP', 'NUMBER', 'NUM',
'PREF')
""" list of str: Labels dependent to other parts.
"""
def generate_hash(classname, funcname, *args, **kwargs):
    """Returns a cache key identifying a method call.

    The key is built by joining the class name, the function name, the
    stringified positional arguments, and the stringified keyword argument
    *values* (keyword names are not part of the key), then hashing the
    result with MD5.

    Args:
      classname (str): Name of the class the function belongs to.
      funcname (str): Name of the function.
      *args: Arbitrary positional arguments, folded in via ``str()``.
      **kwargs: Arbitrary keyword arguments; only the values are folded in.

    Returns:
      str: MD5 hex digest identifying the call.
    """
    key = ':'.join([
        classname, funcname,
        '_'.join([str(a) for a in args]),
        '_'.join([str(w) for w in kwargs.values()])])
    key = hashlib.md5(key.encode('utf-8')).hexdigest()
    return key
def _memorize(func):
"""Decorator to cache the given function's output.
"""
def _wrapper(self, *args, **kwargs):
"""Wrapper to cache the function's output.
"""
if self.use_cache:
cache = load_cache(self.cache_filename)
original_key = generate_hash(
self.__class__.__name__, func.__name__, args, kwargs)
cache_key = hashlib.md5(original_key.encode('utf-8')).hexdigest()
cached_val = cache.get(cache_key)
if cached_val:
return cached_val
val = func(self, *args, **kwargs)
if self.use_cache:
cache.set(cache_key, val)
return val
return _wrapper
class NLAPISegmenter(Segmenter):
    """Natural Language API Segmenter.

    Attributes:
      service: A resource object for interacting with Cloud Natural Language
          API.
      cache_filename (str): File path to the cache file.
      supported_languages (set of str): Codes of the supported languages.

    Args:
      cache_filename (str, optional): File path to the pickle file for
          caching. The file is created automatically if it does not exist. If
          the environment is Google App Engine Standard Environment and the
          memcache service is available, it is used for caching and the
          pickle file won't be generated.
      credentials_path (str, optional): File path to the service account's
          credentials file. If no file path is specified, it tries to
          authenticate with default credentials.
      use_entity (bool, optional): Whether to use entity analysis results to
          wrap entity names in the output.
      use_cache (bool, optional): Whether to use a cache system.
      cache_discovery (bool, optional): Whether to use the cache to build the
          natural language API service [default: True]. When using
          oauth2client >= 4.0.0 or google-auth, its value should be False.
      service (:obj:`googleapiclient.discovery.Resource`, optional): A
          Resource object for interacting with Cloud Natural Language API. If
          this is given, the constructor skips the authentication process and
          uses this service instead.
    """

    supported_languages = {'ja', 'ko', 'zh', 'zh-TW', 'zh-CN', 'zh-HK', 'zh-Hant'}

    def __init__(self, cache_filename, credentials_path, use_entity, use_cache,
                 cache_discovery=True, service=None):
        self.cache_filename = cache_filename
        self.credentials_path = credentials_path
        self.use_entity = use_entity
        self.use_cache = use_cache
        self.service = service
        if self.service is None:
            # Build an authenticated service only when one was not injected.
            self.service = self._authenticate(cache_discovery)

    def _authenticate(self, cache_discovery):
        """Returns an authenticated Cloud Natural Language API resource.

        Args:
          cache_discovery (bool): Whether to cache the API discovery document.

        Returns:
          A :obj:`googleapiclient.discovery.Resource` for the ``language``
          API (v1beta2).

        Raises:
          ImportError: If ``credentials_path`` is set but the
              ``google.oauth2.service_account`` module is unavailable.
        """
        import google_auth_httplib2
        import googleapiclient.discovery
        scope = ['https://www.googleapis.com/auth/cloud-platform']
        if self.credentials_path:
            try:
                from google.oauth2 import service_account
            except ImportError:
                logging.error('Failed to load google.oauth2.service_account module. '
                              'If you are running this script in Google App Engine '
                              'environment, you can initialize the segmenter with '
                              'default credentials.')
                # Re-raise: continuing would fail later with a confusing
                # NameError because no credentials could be constructed.
                raise
            credentials = service_account.Credentials.from_service_account_file(
                self.credentials_path)
            scoped_credentials = credentials.with_scopes(scope)
        else:
            import google.auth
            scoped_credentials, _ = google.auth.default(scope)
        authed_http = google_auth_httplib2.AuthorizedHttp(scoped_credentials)
        service = googleapiclient.discovery.build(
            'language', 'v1beta2', http=authed_http,
            cache_discovery=cache_discovery)
        return service

    def segment(self, source, language=None):
        """Returns a chunk list from the given sentence.

        Args:
          source (str): Source string to segment.
          language (str, optional): A language code.

        Returns:
          A chunk list. (:obj:`budou.chunk.ChunkList`)

        Raises:
          ValueError: If :code:`language` is given and it is not included in
              :code:`supported_languages`.
        """
        if language and language not in self.supported_languages:
            raise ValueError(
                'Language {} is not supported by NLAPI segmenter'.format(language))
        chunks, language = self._get_source_chunks(source, language=language)
        if self.use_entity:
            entities = self._get_entities(source, language=language)
            chunks = self._group_chunks_by_entities(chunks, entities)
        chunks.resolve_dependencies()
        return chunks

    def _get_source_chunks(self, input_text, language=None):
        """Returns a chunk list retrieved from Syntax Analysis results.

        Args:
          input_text (str): Text to annotate.
          language (str, optional): Language of the text.

        Returns:
          A tuple of a chunk list (:obj:`budou.chunk.ChunkList`) and the
          language code reported by the API for the input.
        """
        chunks = ChunkList()
        # Character offset of the end of the last consumed token.
        seek = 0
        result = self._get_annotations(input_text, language=language)
        tokens = result['tokens']
        language = result['language']
        for i, token in enumerate(tokens):
            word = token['text']['content']
            begin_offset = token['text']['beginOffset']
            label = token['dependencyEdge']['label']
            pos = token['partOfSpeech']['tag']
            if begin_offset > seek:
                # A gap between tokens corresponds to whitespace in the
                # source text.
                chunks.append(Chunk.space())
                seek = begin_offset
            chunk = Chunk(word, pos, label)
            if chunk.label in _DEPENDENT_LABEL:
                # Determining concatenating direction based on syntax dependency.
                chunk.dependency = i < token['dependencyEdge']['headTokenIndex']
            if chunk.is_punct():
                chunk.dependency = chunk.is_open_punct()
            chunks.append(chunk)
            seek += len(word)
        return chunks, language

    def _group_chunks_by_entities(self, chunks, entities):
        """Groups chunks by entities retrieved from NL API Entity Analysis.

        Args:
          chunks (:obj:`budou.chunk.ChunkList`): List of chunks to be
              processed.
          entities (:obj:`list` of :obj:`dict`): List of entities.

        Returns:
          A chunk list. (:obj:`budou.chunk.ChunkList`)
        """
        for entity in entities:
            chunks_to_concat = chunks.get_overlaps(
                entity['beginOffset'], len(entity['content']))
            if not chunks_to_concat:
                continue
            # Merge every chunk overlapping the entity into a single chunk so
            # the entity name is never broken across line-break points.
            new_chunk_word = u''.join([chunk.word for chunk in chunks_to_concat])
            new_chunk = Chunk(new_chunk_word)
            chunks.swap(chunks_to_concat, new_chunk)
        return chunks

    @_memorize
    def _get_annotations(self, text, language=''):
        """Returns the list of annotations retrieved from the given text.

        Args:
          text (str): Input text.
          language (str, optional): Language code.

        Returns:
          Results in a dictionary. :code:`tokens` contains the list of
          annotations and :code:`language` contains the inferred language
          from the input.
        """
        body = {
            'document': {
                'type': 'PLAIN_TEXT',
                'content': text,
            },
            'features': {
                'extract_syntax': True,
            },
            'encodingType': 'UTF32',
        }
        if language:
            body['document']['language'] = language
        request = self.service.documents().annotateText(body=body)
        response = request.execute()
        tokens = response.get('tokens', [])
        language = response.get('language')
        return {'tokens': tokens, 'language': language}

    @_memorize
    def _get_entities(self, text, language=''):
        """Returns the list of entities retrieved from the given text.

        Args:
          text (str): Input text.
          language (str, optional): Language code.

        Returns:
          List of entities (dicts with ``content`` and ``beginOffset`` keys),
          one entry per whitespace-separated word of each entity mention.
        """
        body = {
            'document': {
                'type': 'PLAIN_TEXT',
                'content': text,
            },
            'encodingType': 'UTF32',
        }
        if language:
            body['document']['language'] = language
        request = self.service.documents().analyzeEntities(body=body)
        response = request.execute()
        result = []
        for entity in response.get('entities', []):
            mentions = entity.get('mentions', [])
            if not mentions:
                continue
            entity_text = mentions[0]['text']
            offset = entity_text['beginOffset']
            # Split multi-word entities so each word carries its own offset.
            for word in entity_text['content'].split():
                result.append({'content': word, 'beginOffset': offset})
                offset += len(word)
        return result