Initial commit
This commit is contained in:
commit
014d7ecda0
11 changed files with 1215 additions and 0 deletions
1
pybergamot/__init__.py
Normal file
1
pybergamot/__init__.py
Normal file
|
@ -0,0 +1 @@
|
|||
from .translator import Translator
|
106
pybergamot/engine.py
Normal file
106
pybergamot/engine.py
Normal file
|
@ -0,0 +1,106 @@
|
|||
"""
|
||||
pybergamot - (Somewhat) stable interface for the **Bergamot Translation Engine Python Bindings**.
|
||||
Copyright (C) 2023 Ad5001
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
Meta repositories for translation engines
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from bergamot import Service, VectorString, ResponseOptions, VectorResponse, TranslationModel
|
||||
|
||||
|
||||
class Engine(ABC):
    """
    Common interface shared by every kind of translation engine.
    """

    @property
    @abstractmethod
    def source_lang(self) -> str:
        """
        Two-char ISO code of the language the engine translates from.
        """

    @property
    @abstractmethod
    def target_lang(self) -> str:
        """
        Two-char ISO code of the language the engine translates to.
        """

    @abstractmethod
    def translate(self, text: str, html: bool = False, alignment: bool = False, quality_scores: bool = False) -> str:
        """
        Translates the text from the engine's source language into its target language.
        :param text: Text to translate.
        :param html: Set to True if the text contains an HTML structure which needs to
                     be preserved while translated.
        :param alignment: Toggle for alignment.
        :param quality_scores: Toggle for whether to include the translation's quality scores
                               for each word in HTML format.
        :return: The translated text.
        """
|
||||
|
||||
|
||||
class DirectBergamotModelEngine(Engine):
    """
    Engine which translates using one single Bergamot model.
    """

    def __init__(self, source_lang: str, target_lang: str, model: TranslationModel,
                 service: Service):
        """
        Creates the engine.
        :param source_lang: Two-char ISO code of the language translated from.
        :param target_lang: Two-char ISO code of the language translated to.
        :param model: Bergamot model handed to the service for every translation.
        :param service: Bergamot service used to run the translations.
        """
        self.service = service
        self.model = model
        self._source_lang = source_lang
        self._target_lang = target_lang

    @property
    def source_lang(self) -> str:
        """
        Two-char ISO code of the language the engine translates from.
        """
        return self._source_lang

    @property
    def target_lang(self) -> str:
        """
        Two-char ISO code of the language the engine translates to.
        """
        return self._target_lang

    def translate(self, text: str, html: bool = False, alignment: bool = False, quality_scores: bool = False) -> str:
        """
        Translates the text with the engine's model.
        :param text: Text to translate.
        :param html: Set to True if the text contains an HTML structure which needs to
                     be preserved while translated.
        :param alignment: Toggle for alignment.
        :param quality_scores: Toggle for whether to include the translation's quality scores
                               for each word in HTML format.
        :return: The translated text.
        """
        options = ResponseOptions(
            alignment=alignment, qualityScores=quality_scores, HTML=html
        )
        # The service works on batches: send a one-element batch and read back its only response.
        batch = VectorString([text])
        responses: VectorResponse = self.service.translate(self.model, batch, options)
        first_response = responses[0]
        return first_response.target.text
|
||||
|
||||
|
||||
class ChainBergamotModelsEngine(Engine):
    """
    Engine which translates by handing two Bergamot models to the service's pivot() call.
    """

    def __init__(self, source_lang: str, target_lang: str, model1: TranslationModel,
                 model2: TranslationModel, service: Service):
        """
        Creates the engine.
        :param source_lang: Two-char ISO code of the language translated from.
        :param target_lang: Two-char ISO code of the language translated to.
        :param model1: First Bergamot model passed to pivot().
        :param model2: Second Bergamot model passed to pivot().
        :param service: Bergamot service used to run the translations.
        """
        self.service = service
        self.model1 = model1
        self.model2 = model2
        self._source_lang = source_lang
        self._target_lang = target_lang

    @property
    def source_lang(self) -> str:
        """
        Two-char ISO code of the language the engine translates from.
        """
        return self._source_lang

    @property
    def target_lang(self) -> str:
        """
        Two-char ISO code of the language the engine translates to.
        """
        return self._target_lang

    def translate(self, text: str, html: bool = False, alignment: bool = False, quality_scores: bool = False) -> str:
        """
        Translates the text by pivoting through the engine's two models.
        :param text: Text to translate.
        :param html: Set to True if the text contains an HTML structure which needs to
                     be preserved while translated.
        :param alignment: Toggle for alignment.
        :param quality_scores: Toggle for whether to include the translation's quality scores
                               for each word in HTML format.
        :return: The translated text.
        """
        options = ResponseOptions(
            alignment=alignment, qualityScores=quality_scores, HTML=html
        )
        # The service works on batches: send a one-element batch and read back its only response.
        batch = VectorString([text])
        responses: VectorResponse = self.service.pivot(self.model1, self.model2, batch, options)
        first_response = responses[0]
        return first_response.target.text
|
119
pybergamot/models.py
Normal file
119
pybergamot/models.py
Normal file
|
@ -0,0 +1,119 @@
|
|||
"""
|
||||
pybergamot - (Somewhat) stable interface for the **Bergamot Translation Engine Python Bindings**.
|
||||
Copyright (C) 2023 Ad5001
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
Lists all repositories and connects them to default language.
|
||||
"""
|
||||
|
||||
from bergamot import REPOSITORY
|
||||
from languagecodes import iso_639_alpha2
|
||||
|
||||
|
||||
class Models:
|
||||
REPO_FOR_MODEL = {}
|
||||
AVAILABLE = []
|
||||
INSTALLED = []
|
||||
LANGS = []
|
||||
|
||||
@staticmethod
|
||||
def update_models_list() -> None:
|
||||
"""
|
||||
Imports the list of models from all repositories.
|
||||
"""
|
||||
Models.REPO_FOR_MODEL = {model_name: repo
|
||||
for repo in REPOSITORY.repositories
|
||||
for model_name in REPOSITORY.repositories[repo].models(False)}
|
||||
Models.AVAILABLE = [model_name
|
||||
for repo in REPOSITORY.repositories
|
||||
for model_name in REPOSITORY.repositories[repo].models(False)]
|
||||
Models.INSTALLED = [model_name
|
||||
for repo in REPOSITORY.repositories
|
||||
for model_name in REPOSITORY.repositories[repo].models(True)]
|
||||
Models.LANGS = []
|
||||
for model_name in Models.AVAILABLE:
|
||||
lang1, lang2 = Models.get_model_languages(model_name)
|
||||
if lang1 not in Models.LANGS:
|
||||
Models.LANGS.append(lang1)
|
||||
if lang2 not in Models.LANGS:
|
||||
Models.LANGS.append(lang2)
|
||||
|
||||
@staticmethod
|
||||
def update_repositories_cache() -> None:
|
||||
"""
|
||||
Fetches the online models list for every repository.
|
||||
"""
|
||||
for repo in REPOSITORY.repositories:
|
||||
REPOSITORY.repositories[repo].update()
|
||||
Models.update_models_list()
|
||||
|
||||
@staticmethod
|
||||
def get_model_languages(model_name: str) -> tuple:
|
||||
"""
|
||||
Returns a tuple of two two-char ISO language name which the model translates from and to.
|
||||
:param model_name: Name of the model
|
||||
:raises:
|
||||
ValueError: When the model_name doesn't exist.
|
||||
:return: (from language, to language)
|
||||
"""
|
||||
if model_name not in Models.AVAILABLE:
|
||||
raise ValueError(f"Model {model_name} does not exist. Did you update the repository cache?")
|
||||
model = REPOSITORY.model(Models.REPO_FOR_MODEL[model_name], model_name)
|
||||
src, target, tiny = model['code'].split("-")
|
||||
if len(src) == 3:
|
||||
src = iso_639_alpha2(src)
|
||||
if len(target) == 3:
|
||||
target = iso_639_alpha2(target)
|
||||
return src, target
|
||||
|
||||
@staticmethod
|
||||
def get_model_name_for_languages(source_lang: str, target_lang: str) -> str | None:
|
||||
"""
|
||||
Finds a model which translates source_lang into target_lang.
|
||||
:param source_lang: Language to translate from.
|
||||
:param target_lang: Language to translate to.
|
||||
:return: None if no model was found, name of the model otherwise.
|
||||
"""
|
||||
lang_tuple = (source_lang, target_lang)
|
||||
names = list(filter(lambda name: lang_tuple == Models.get_model_languages(name), Models.AVAILABLE))
|
||||
if len(names) > 0:
|
||||
model_name = names[0]
|
||||
else:
|
||||
model_name = None
|
||||
return model_name
|
||||
|
||||
@staticmethod
|
||||
def download(model_name: str) -> None:
|
||||
"""
|
||||
Downloads or updates the given model.
|
||||
:param model_name: Name of the model to download.
|
||||
:raises:
|
||||
ValueError: When the model_name doesn't exist.
|
||||
"""
|
||||
if model_name not in Models.AVAILABLE:
|
||||
raise ValueError(f"Model {model_name} does not exist. Did you update the repository cache?")
|
||||
REPOSITORY.download(Models.REPO_FOR_MODEL[model_name], model_name)
|
||||
|
||||
@staticmethod
|
||||
def update_all_models() -> None:
|
||||
"""
|
||||
Updates all already downloaded models to their latest versions.
|
||||
"""
|
||||
for model_name in Models.INSTALLED:
|
||||
REPOSITORY.download(Models.REPO_FOR_MODEL[model_name], model_name)
|
||||
|
||||
|
||||
# Populate the Models class attributes as soon as this module is imported.
Models.update_models_list()
|
155
pybergamot/translator.py
Normal file
155
pybergamot/translator.py
Normal file
|
@ -0,0 +1,155 @@
|
|||
"""
|
||||
pybergamot - (Somewhat) stable interface for the **Bergamot Translation Engine Python Bindings**.
|
||||
Copyright (C) 2023 Ad5001
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
"""
|
||||
|
||||
from bergamot import REPOSITORY, TranslationModel, Service, ServiceConfig
|
||||
from warnings import warn
|
||||
|
||||
from .models import Models
|
||||
from .engine import Engine, DirectBergamotModelEngine, ChainBergamotModelsEngine
|
||||
|
||||
|
||||
class Translator:
    """
    Main exposed class to provide translation using Bergamot.
    Workflow goes as follows:
    1. Create an instance.
    2. Load languages with load().
    3. Translate between any pair of loaded languages with translate().
    """

    # Language used as intermediary when two languages are chained through two models.
    _PIVOT_LANG = "en"

    def __init__(self, workers_count: int = 1, cache_size: int = 0, log_level: str = 'off'):
        """
        Creates a Translator instance.
        :param workers_count: Number of workers which can be used at once.
        :param cache_size: Size of the cache used in bergamot.
        :param log_level: Level of logs used in bergamot.
        """
        self.loaded_languages = []
        # Engines indexed as _loaded_engines[source_lang][target_lang].
        self._loaded_engines = {}
        # Models already created by this translator, indexed by model name.
        self._loaded_models = {}
        config = ServiceConfig(numWorkers=workers_count, cacheSize=cache_size, logLevel=log_level)
        self.service = Service(config)

    def _load_model(self, model_name: str, download: bool = True) -> "TranslationModel":
        """
        Loads a tiny model by its name, downloads it if it doesn't exist.
        A model is only created once per translator: later requests reuse the same instance.
        :param model_name: Name of the model to load.
        :param download: If a model does not exist locally, if True, download it,
                         otherwise emit an error.
        :raises:
            ValueError: If the provided model does not exist.
            EnvironmentError: When a model is unavailable and download has been set to false.
        :return: Bergamot translation model instance.
        """
        if model_name not in Models.AVAILABLE:
            raise ValueError(f"Model {model_name} not available.")
        # Several engines can rely on the same model (e.g. every chain going through English):
        # reuse it rather than downloading and loading it again.
        if model_name in self._loaded_models:
            return self._loaded_models[model_name]
        # Check if the model needs to be downloaded.
        if model_name not in Models.INSTALLED:
            if not download:
                langs = Models.get_model_languages(model_name)
                raise EnvironmentError(f"Translation model from {langs[0]} to {langs[1]} is not installed locally.")
            Models.download(model_name)
        # Create model
        model_path = REPOSITORY.modelConfigPath(Models.REPO_FOR_MODEL[model_name], model_name)
        model = self.service.modelFromConfigPath(model_path)
        self._loaded_models[model_name] = model
        return model

    def _create_engine(self, source_lang: str, target_lang: str, download: bool = True) -> "Engine":
        """
        Creates an Engine to translate a source lang to a target lang.
        :param source_lang: Language to translate from.
        :param target_lang: Language to translate to.
        :param download: If a model does not exist locally, if True, download it,
                         otherwise emit an error.
        :raises:
            ValueError: If a model from a lang to english does not exist.
            EnvironmentError: When a model is unavailable and download has been set to false.
        :return: Engine instance.
        """
        pivot = self._PIVOT_LANG
        involves_pivot = pivot in (source_lang, target_lang)
        direct_model_name = Models.get_model_name_for_languages(source_lang, target_lang)
        if direct_model_name is not None and (
                download or involves_pivot or direct_model_name in Models.INSTALLED):
            # Direct model exists. When English is one of the two languages there is nothing
            # to chain through, so the direct model is the only option: _load_model() raises
            # the documented EnvironmentError if it is not installed and download is disabled.
            return DirectBergamotModelEngine(
                source_lang, target_lang, self._load_model(direct_model_name, download), self.service
            )
        if involves_pivot:
            # No direct model and no possible intermediary: name the non-English language.
            other_lang = target_lang if source_lang == pivot else source_lang
            raise ValueError(f"Missing translation models between English and {other_lang}.")
        # Use chain models with English as intermediary.
        model1 = Models.get_model_name_for_languages(source_lang, pivot)
        model2 = Models.get_model_name_for_languages(pivot, target_lang)
        if model1 is None:
            raise ValueError(f"Missing translation models between English and {source_lang}.")
        if model2 is None:
            raise ValueError(f"Missing translation models between English and {target_lang}.")
        # Create the engine
        return ChainBergamotModelsEngine(
            source_lang, target_lang,
            self._load_model(model1, download), self._load_model(model2, download),
            self.service
        )

    def load(self, lang: str, download: bool = True) -> None:
        """
        Loads a language code and all the associated models (for already added languages)
        into the translator. When an engine cannot be created, the translator is left unchanged.

        :param lang: Two-char ISO language name.
        :param download: If a model does not exist locally, if True, download it,
                         otherwise emit an error.
        :raises:
            ValueError: If a model from a lang to english does not exist.
            EnvironmentError: When a model is unavailable and download has been set to false.
        """
        if lang not in Models.LANGS:
            raise ValueError(f"Language {lang} does not exist.")
        if lang in self.loaded_languages:
            warn(f"Language {lang} has already been imported.", RuntimeWarning)
            return
        # Find whether there is a direct model for translating with other loaded language
        # or we need to use a pivot. Every engine is built before anything is registered,
        # so a failure does not leave half-registered engines behind.
        forward_engines = {}
        backward_engines = {}
        for other_lang in self.loaded_languages:
            forward_engines[other_lang] = self._create_engine(lang, other_lang, download)
            backward_engines[other_lang] = self._create_engine(other_lang, lang, download)
        # Register language
        self._loaded_engines[lang] = forward_engines
        for other_lang, backward_engine in backward_engines.items():
            self._loaded_engines[other_lang][lang] = backward_engine
        self.loaded_languages.append(lang)

    def translate(self, source_lang: str, target_lang: str, text: str,
                  html: bool = False, alignment: bool = False, quality_scores: bool = False) -> str:
        """
        Translates a text from a source lang to a target lang.
        When both languages are the same, the text is returned unchanged.

        :param source_lang: Language to translate from.
        :param target_lang: Language to translate to.
        :param text: Text to translate.
        :param html: Set to True if the text contains an HTML structure which needs to
                     be preserved while translated.
        :param alignment: Toggle for alignment.
        :param quality_scores: Toggle for whether to include the translation's quality scores
                               for each word in HTML format.
        :raises:
            ValueError: Either source_lang or target_lang haven't been loaded yet.
        :return: The translated text.
        """
        if source_lang not in self.loaded_languages:
            raise ValueError(f"Language {source_lang} is not loaded. Use the load() function first.")
        if target_lang not in self.loaded_languages:
            raise ValueError(f"Language {target_lang} is not loaded. Use the load() function first.")
        if source_lang == target_lang:
            # load() never registers an engine from a language to itself, so the lookup
            # below would fail with a KeyError: there is nothing to translate anyway.
            return text
        return self._loaded_engines[source_lang][target_lang].translate(text, html, alignment, quality_scores)
|
Loading…
Add table
Add a link
Reference in a new issue