Initial commit

This commit is contained in:
Adsooi 2023-11-03 00:36:46 +01:00
commit 014d7ecda0
Signed by: Ad5001
GPG key ID: EF45F9C6AFE20160
11 changed files with 1215 additions and 0 deletions

1
pybergamot/__init__.py Normal file
View file

@ -0,0 +1 @@
from .translator import Translator

106
pybergamot/engine.py Normal file
View file

@ -0,0 +1,106 @@
"""
pybergamot - (Somewhat) stable interface for the **Bergamot Translation Engine Python Bindings**.
Copyright (C) 2023 Ad5001
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Translation engine abstractions built on top of Bergamot translation models.
"""
from abc import ABC, abstractmethod
from bergamot import Service, VectorString, ResponseOptions, VectorResponse, TranslationModel
class Engine(ABC):
    """
    Common contract shared by every kind of translation engine.

    Concrete engines expose the language pair they handle and a
    ``translate`` method turning source-language text into the target language.
    """

    @property
    @abstractmethod
    def source_lang(self) -> str:
        """
        Two-char ISO code of the language this engine translates from.
        """

    @property
    @abstractmethod
    def target_lang(self) -> str:
        """
        Two-char ISO code of the language this engine translates to.
        """

    @abstractmethod
    def translate(self, text: str, html: bool = False, alignment: bool = False, quality_scores: bool = False) -> str:
        """
        Translates the text from the engine's source language into its target language.

        :param text: Text to translate.
        :param html: Set to True if the text contains an HTML structure which needs to
                     be preserved while translated.
        :param alignment: Toggle for alignment.
        :param quality_scores: Toggle for whether to include the translation's quality
                               scores for each word in HTML format.
        :return: The translated text.
        """
class DirectBergamotModelEngine(Engine):
    """
    Engine backed by a single Bergamot model which translates straight
    from the source language to the target language.
    """

    def __init__(self, source_lang: str, target_lang: str, model: TranslationModel,
                 service: Service):
        """
        :param source_lang: Language the model translates from.
        :param target_lang: Language the model translates to.
        :param model: Bergamot model handed to the service on each translation.
        :param service: Bergamot service used to run the translation.
        """
        self.service = service
        self.model = model
        self._source_lang, self._target_lang = source_lang, target_lang

    @property
    def source_lang(self) -> str:
        return self._source_lang

    @property
    def target_lang(self) -> str:
        return self._target_lang

    def translate(self, text: str, html: bool = False, alignment: bool = False, quality_scores: bool = False) -> str:
        """
        Translates the text with the engine's model.

        :param text: Text to translate.
        :param html: Forwarded to Bergamot as the ``HTML`` response option.
        :param alignment: Forwarded to Bergamot as the ``alignment`` response option.
        :param quality_scores: Forwarded to Bergamot as the ``qualityScores`` response option.
        :return: Target text of the first (and only) response.
        """
        options = ResponseOptions(
            HTML=html, alignment=alignment, qualityScores=quality_scores
        )
        # The service works on batches: wrap the text in a one-element vector.
        batch = VectorString([text])
        responses: VectorResponse = self.service.translate(self.model, batch, options)
        return responses[0].target.text
class ChainBergamotModelsEngine(Engine):
    """
    Engine which chains two Bergamot models through the service's
    ``pivot`` call: ``model1`` is passed first, ``model2`` second.
    """

    def __init__(self, source_lang: str, target_lang: str, model1: TranslationModel,
                 model2: TranslationModel, service: Service):
        """
        :param source_lang: Language the chain translates from.
        :param target_lang: Language the chain translates to.
        :param model1: First model given to ``service.pivot``.
        :param model2: Second model given to ``service.pivot``.
        :param service: Bergamot service used to run the translation.
        """
        self.service = service
        self.model1, self.model2 = model1, model2
        self._source_lang, self._target_lang = source_lang, target_lang

    @property
    def source_lang(self) -> str:
        return self._source_lang

    @property
    def target_lang(self) -> str:
        return self._target_lang

    def translate(self, text: str, html: bool = False, alignment: bool = False, quality_scores: bool = False) -> str:
        """
        Translates the text through both models of the chain.

        :param text: Text to translate.
        :param html: Forwarded to Bergamot as the ``HTML`` response option.
        :param alignment: Forwarded to Bergamot as the ``alignment`` response option.
        :param quality_scores: Forwarded to Bergamot as the ``qualityScores`` response option.
        :return: Target text of the first (and only) response.
        """
        options = ResponseOptions(
            HTML=html, alignment=alignment, qualityScores=quality_scores
        )
        # The service works on batches: wrap the text in a one-element vector.
        batch = VectorString([text])
        responses: VectorResponse = self.service.pivot(self.model1, self.model2, batch, options)
        return responses[0].target.text

119
pybergamot/models.py Normal file
View file

@ -0,0 +1,119 @@
"""
pybergamot - (Somewhat) stable interface for the **Bergamot Translation Engine Python Bindings**.
Copyright (C) 2023 Ad5001
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Lists all repositories and connects them to default language.
"""
from bergamot import REPOSITORY
from languagecodes import iso_639_alpha2
class Models:
REPO_FOR_MODEL = {}
AVAILABLE = []
INSTALLED = []
LANGS = []
@staticmethod
def update_models_list() -> None:
"""
Imports the list of models from all repositories.
"""
Models.REPO_FOR_MODEL = {model_name: repo
for repo in REPOSITORY.repositories
for model_name in REPOSITORY.repositories[repo].models(False)}
Models.AVAILABLE = [model_name
for repo in REPOSITORY.repositories
for model_name in REPOSITORY.repositories[repo].models(False)]
Models.INSTALLED = [model_name
for repo in REPOSITORY.repositories
for model_name in REPOSITORY.repositories[repo].models(True)]
Models.LANGS = []
for model_name in Models.AVAILABLE:
lang1, lang2 = Models.get_model_languages(model_name)
if lang1 not in Models.LANGS:
Models.LANGS.append(lang1)
if lang2 not in Models.LANGS:
Models.LANGS.append(lang2)
@staticmethod
def update_repositories_cache() -> None:
"""
Fetches the online models list for every repository.
"""
for repo in REPOSITORY.repositories:
REPOSITORY.repositories[repo].update()
Models.update_models_list()
@staticmethod
def get_model_languages(model_name: str) -> tuple:
"""
Returns a tuple of two two-char ISO language name which the model translates from and to.
:param model_name: Name of the model
:raises:
ValueError: When the model_name doesn't exist.
:return: (from language, to language)
"""
if model_name not in Models.AVAILABLE:
raise ValueError(f"Model {model_name} does not exist. Did you update the repository cache?")
model = REPOSITORY.model(Models.REPO_FOR_MODEL[model_name], model_name)
src, target, tiny = model['code'].split("-")
if len(src) == 3:
src = iso_639_alpha2(src)
if len(target) == 3:
target = iso_639_alpha2(target)
return src, target
@staticmethod
def get_model_name_for_languages(source_lang: str, target_lang: str) -> str | None:
"""
Finds a model which translates source_lang into target_lang.
:param source_lang: Language to translate from.
:param target_lang: Language to translate to.
:return: None if no model was found, name of the model otherwise.
"""
lang_tuple = (source_lang, target_lang)
names = list(filter(lambda name: lang_tuple == Models.get_model_languages(name), Models.AVAILABLE))
if len(names) > 0:
model_name = names[0]
else:
model_name = None
return model_name
@staticmethod
def download(model_name: str) -> None:
"""
Downloads or updates the given model.
:param model_name: Name of the model to download.
:raises:
ValueError: When the model_name doesn't exist.
"""
if model_name not in Models.AVAILABLE:
raise ValueError(f"Model {model_name} does not exist. Did you update the repository cache?")
REPOSITORY.download(Models.REPO_FOR_MODEL[model_name], model_name)
@staticmethod
def update_all_models() -> None:
"""
Updates all already downloaded models to their latest versions.
"""
for model_name in Models.INSTALLED:
REPOSITORY.download(Models.REPO_FOR_MODEL[model_name], model_name)
Models.update_models_list()

155
pybergamot/translator.py Normal file
View file

@ -0,0 +1,155 @@
"""
pybergamot - (Somewhat) stable interface for the **Bergamot Translation Engine Python Bindings**.
Copyright (C) 2023 Ad5001
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from bergamot import REPOSITORY, TranslationModel, Service, ServiceConfig
from warnings import warn
from .models import Models
from .engine import Engine, DirectBergamotModelEngine, ChainBergamotModelsEngine
class Translator:
    """
    Main exposed class to provide translation using Bergamot.

    Workflow goes as follows:
    1. Create instance
    2. Load languages
    3. Use translation between any of the loaded languages.
    """

    def __init__(self, workers_count: int = 1, cache_size: int = 0, log_level: str = 'off'):
        """
        Creates a Translator instance.

        :param workers_count: Number of workers which can be used at once.
        :param cache_size: Size of the cache used in bergamot.
        :param log_level: Level of logs used in bergamot.
        """
        # Languages registered through load(), in loading order.
        self.loaded_languages = []
        # Engines indexed as _loaded_engines[source_lang][target_lang].
        self._loaded_engines = {}
        config = ServiceConfig(numWorkers=workers_count, cacheSize=cache_size, logLevel=log_level)
        self.service = Service(config)

    def _load_model(self, model_name: str, download: bool = True) -> "TranslationModel":
        """
        Loads a model by its name, downloads it if it doesn't exist.

        :param model_name: Name of the model to load.
        :param download: If a model does not exist locally, if True, download it,
                         otherwise emit an error.
        :raises:
            ValueError: If the provided model does not exist.
            EnvironmentError: When a model is unavailable and download has been set to false.
        :return: Bergamot translation model instance.
        """
        if model_name not in Models.AVAILABLE:
            raise ValueError(f"Model {model_name} not available.")
        # Check if the model needs to be downloaded.
        if model_name not in Models.INSTALLED:
            if not download:
                langs = Models.get_model_languages(model_name)
                raise EnvironmentError(f"Translation model from {langs[0]} to {langs[1]} is not installed locally.")
            Models.download(model_name)
        # Create model
        model_path = REPOSITORY.modelConfigPath(Models.REPO_FOR_MODEL[model_name], model_name)
        return self.service.modelFromConfigPath(model_path)

    def _create_engine(self, source_lang: str, target_lang: str, download: bool = True) -> "Engine":
        """
        Creates an Engine to translate a source lang to a target lang.

        A direct model is preferred. Otherwise, two models are chained with
        English as the intermediary language.

        :param source_lang: Language to translate from.
        :param target_lang: Language to translate to.
        :param download: If a model does not exist locally, if True, download it,
                         otherwise emit an error.
        :raises:
            ValueError: If a model from a lang to english does not exist.
            EnvironmentError: When a model is unavailable and download has been set to false.
        :return: Engine instance.
        """
        direct_model_name = Models.get_model_name_for_languages(source_lang, target_lang)
        # English cannot be pivoted through itself: when it is one side of the
        # pair, the direct model is the only option, so let _load_model raise
        # EnvironmentError if it is not installed and download is disabled.
        english_involved = "en" in (source_lang, target_lang)
        if direct_model_name is not None and (
                download or english_involved or direct_model_name in Models.INSTALLED):
            return DirectBergamotModelEngine(
                source_lang, target_lang, self._load_model(direct_model_name, download), self.service
            )
        if english_involved:
            # Name the non-English side of the pair in the error.
            other_lang = target_lang if source_lang == "en" else source_lang
            raise ValueError(f"Missing translation models between English and {other_lang}.")
        # Use chain models with English as intermediary.
        model1 = Models.get_model_name_for_languages(source_lang, "en")
        model2 = Models.get_model_name_for_languages("en", target_lang)
        if model1 is None:
            raise ValueError(f"Missing translation models between English and {source_lang}.")
        if model2 is None:
            raise ValueError(f"Missing translation models between English and {target_lang}.")
        return ChainBergamotModelsEngine(
            source_lang, target_lang,
            self._load_model(model1, download), self._load_model(model2, download),
            self.service
        )

    def load(self, lang: str, download: bool = True) -> None:
        """
        Loads a language code and all the associated models (for already added languages)
        into the translator.

        Emits a RuntimeWarning and does nothing when the language is already loaded.
        The translator is left untouched if any engine fails to be created.

        :param lang: Two-char ISO language name.
        :param download: If a model does not exist locally, if True, download it,
                         otherwise emit an error.
        :raises:
            ValueError: If the language is unknown, or a model from a lang to english does not exist.
            EnvironmentError: When a model is unavailable and download has been set to false.
        """
        if lang not in Models.LANGS:
            raise ValueError(f"Language {lang} does not exist.")
        if lang in self.loaded_languages:
            warn(f"Language {lang} has already been imported.", RuntimeWarning)
            return
        # Build every engine before touching the translator's state, so that a
        # failure does not leave half-registered engines behind.
        forward_engines = {}
        backward_engines = {}
        for other_lang in self.loaded_languages:
            forward_engines[other_lang] = self._create_engine(lang, other_lang, download)
            backward_engines[other_lang] = self._create_engine(other_lang, lang, download)
        # Register language
        self._loaded_engines[lang] = forward_engines
        for other_lang, engine in backward_engines.items():
            self._loaded_engines[other_lang][lang] = engine
        self.loaded_languages.append(lang)

    def translate(self, source_lang: str, target_lang: str, text: str,
                  html: bool = False, alignment: bool = False, quality_scores: bool = False) -> str:
        """
        Translates a text from a source lang to a target lang.

        :param source_lang: Language to translate from.
        :param target_lang: Language to translate to.
        :param text: Text to translate.
        :param html: Set to True if the text contains an HTML structure which needs to
                     be preserved while translated.
        :param alignment: Toggle for alignment.
        :param quality_scores: Toggle for whether to include the translation's quality scores
                               for each word in HTML format.
        :raises:
            ValueError: Either source_lang or target_lang haven't been loaded yet.
        :return: The translated text. The text is returned unchanged when
                 source_lang and target_lang are the same.
        """
        if source_lang not in self.loaded_languages:
            raise ValueError(f"Language {source_lang} is not loaded. Use the load() function first.")
        if target_lang not in self.loaded_languages:
            raise ValueError(f"Language {target_lang} is not loaded. Use the load() function first.")
        if source_lang == target_lang:
            # No engine is ever registered from a language to itself.
            return text
        return self._loaded_engines[source_lang][target_lang].translate(text, html, alignment, quality_scores)