from __future__ import division
from .base import io as _base_io
from . import decoder as _dec
from . import fstext as _fst
from .fstext import utils as _fst_utils
from .gmm import am as _gmm_am
from . import hmm as _hmm
from .lat import align as _lat_align
from .lat import functions as _lat_funcs
from .matrix import _kaldi_matrix
from . import nnet3 as _nnet3
from . import tree as _tree
from .util import io as _util_io
# Names exported by ``from <this module> import *``: the four aligner classes
# defined below.
__all__ = ['Aligner', 'MappedAligner', 'GmmAligner', 'NnetAligner']
class Aligner(object):
    """Speech aligner.

    This can be used to align transition-id log-likelihood matrices with
    reference texts.

    Args:
        transition_model (TransitionModel): The transition model.
        tree (ContextDependency): The phonetic decision tree.
        lexicon (StdFst): The lexicon FST.
        symbols (SymbolTable): The symbol table. If provided, "text" input of
            :meth:`align` should include symbols instead of integer indices.
        disambig_symbols (List[int]): Disambiguation symbols.
        graph_compiler_opts (TrainingGraphCompilerOptions): Configuration
            options for graph compiler.
        beam (float): Decoding beam used in alignment.
        transition_scale (float): The scale on non-self-loop transition
            probabilities.
        self_loop_scale (float): The scale on self-loop transition
            probabilities.
        acoustic_scale (float): Acoustic score scale.
    """

    def __init__(self, transition_model, tree, lexicon, symbols=None,
                 disambig_symbols=None, graph_compiler_opts=None, beam=200.0,
                 transition_scale=1.0, self_loop_scale=1.0, acoustic_scale=0.1):
        self.transition_model = transition_model
        self.symbols = symbols
        if not graph_compiler_opts:
            graph_compiler_opts = _dec.TrainingGraphCompilerOptions()
        # NOTE(review): disambig_symbols is forwarded as-is, including the
        # default None -- confirm the compiler binding accepts None.
        self.graph_compiler = _dec.TrainingGraphCompiler(
            transition_model, tree, lexicon,
            disambig_symbols, graph_compiler_opts)
        self.decoder_opts = _dec.FasterDecoderOptions()
        self.decoder_opts.beam = beam
        self.transition_scale = transition_scale
        self.self_loop_scale = self_loop_scale
        self.acoustic_scale = acoustic_scale

    @staticmethod
    def read_tree(tree_rxfilename):
        """Reads phonetic decision tree from an extended filename.

        Args:
            tree_rxfilename (str): Extended filename for reading the tree.

        Returns:
            ContextDependency: Phonetic decision tree.
        """
        tree = _tree.ContextDependency()
        with _util_io.xopen(tree_rxfilename) as ki:
            tree.read(ki.stream(), ki.binary)
        return tree

    @staticmethod
    def read_lexicon(lexicon_rxfilename):
        """Reads lexicon FST from an extended filename.

        Args:
            lexicon_rxfilename (str): Extended filename for reading the
                lexicon FST.

        Returns:
            StdFst: Lexicon FST.
        """
        return _fst.read_fst_kaldi(lexicon_rxfilename)

    @staticmethod
    def read_symbols(symbols_filename):
        """Reads symbol table from file.

        Args:
            symbols_filename (str): The symbols file, or ``None``.

        Returns:
            SymbolTable: Symbol table, or ``None`` if no filename is given.
        """
        if symbols_filename is None:
            return None
        return _fst.SymbolTable.read_text(symbols_filename)

    @staticmethod
    def read_disambig_symbols(disambig_rxfilename):
        """Reads disambiguation symbols from an extended filename.

        The input is read in text mode, one integer per line. Lines that
        contain only whitespace are ignored.

        Args:
            disambig_rxfilename (str): Extended filename for reading the list
                of disambiguation symbols, or ``None``.

        Returns:
            List[int]: List of disambiguation symbols, or ``None`` if no
            filename is given.

        Raises:
            ValueError: If a non-blank line is not an integer.
        """
        if disambig_rxfilename is None:
            return None
        with _util_io.xopen(disambig_rxfilename, "rt") as ki:
            # int() tolerates surrounding whitespace; blank lines (e.g. a
            # trailing empty line) are skipped rather than failing on int('').
            return [int(line) for line in ki if line.strip()]

    @staticmethod
    def read_model(model_rxfilename):
        """Reads transition model from an extended filename.

        Args:
            model_rxfilename (str): Extended filename for reading the
                transition model.

        Returns:
            TransitionModel: Transition model.
        """
        with _util_io.xopen(model_rxfilename) as ki:
            return _hmm.TransitionModel().read(ki.stream(), ki.binary)

    @classmethod
    def from_files(cls, model_rxfilename, tree_rxfilename, lexicon_rxfilename,
                   symbols_filename=None, disambig_rxfilename=None,
                   graph_compiler_opts=None, beam=200.0, transition_scale=1.0,
                   self_loop_scale=1.0, acoustic_scale=0.1):
        """Constructs a new aligner from given files.

        Args:
            model_rxfilename (str): Extended filename for reading the transition
                model.
            tree_rxfilename (str): Extended filename for reading the phonetic
                decision tree.
            lexicon_rxfilename (str): Extended filename for reading the lexicon
                FST.
            symbols_filename (str): The symbols file. If provided, "text" input
                of :meth:`align` should include symbols instead of integer
                indices.
            disambig_rxfilename (str): Extended filename for reading the list
                of disambiguation symbols.
            graph_compiler_opts (TrainingGraphCompilerOptions): Configuration
                options for graph compiler.
            beam (float): Decoding beam used in alignment.
            transition_scale (float): The scale on non-self-loop transition
                probabilities.
            self_loop_scale (float): The scale on self-loop transition
                probabilities.
            acoustic_scale (float): Acoustic score scale.

        Returns:
            A new aligner object.
        """
        transition_model = cls.read_model(model_rxfilename)
        tree = cls.read_tree(tree_rxfilename)
        lexicon = cls.read_lexicon(lexicon_rxfilename)
        symbols = cls.read_symbols(symbols_filename)
        disambig_symbols = cls.read_disambig_symbols(disambig_rxfilename)
        return cls(transition_model, tree, lexicon, symbols,
                   disambig_symbols, graph_compiler_opts, beam,
                   transition_scale, self_loop_scale, acoustic_scale)

    def _make_decodable(self, loglikes):
        """Constructs a new decodable object from input log-likelihoods.

        Args:
            loglikes (object): Input log-likelihoods.

        Returns:
            DecodableMatrixScaled: A decodable object for computing scaled
            log-likelihoods.

        Raises:
            ValueError: If the input has no rows.
        """
        if loglikes.num_rows == 0:
            raise ValueError("Empty loglikes matrix.")
        return _dec.DecodableMatrixScaled(loglikes, self.acoustic_scale)

    def _text_to_words(self, text):
        """Converts reference text to a list of integer word indices.

        Args:
            text (str): Whitespace separated symbols if :attr:`symbols` is
                set, whitespace separated integer indices otherwise.

        Returns:
            List[int]: Word indices in the order they appear in the text.

        Raises:
            ValueError: If :attr:`symbols` is not set and a token is not an
                integer.
        """
        tokens = text.split()
        if self.symbols:
            return _fst.symbols_to_indices(self.symbols, tokens)
        # Without a symbol table the tokens are documented to be integer
        # indices, so they are converted here instead of being handed to the
        # graph compiler as strings.
        return [int(token) for token in tokens]

    def align(self, input, text):
        """Aligns input with text.

        Output is a dictionary with the following `(key, value)` pairs:

        ================ =========================== ===========================
        key              value                       value type
        ================ =========================== ===========================
        "alignment"      Frame-level alignment       `List[int]`
        "best_path"      Best lattice path           `CompactLattice`
        "likelihood"     Log-likelihood of best path `float`
        "weight"         Cost of best path           `LatticeWeight`
        ================ =========================== ===========================

        If :attr:`symbols` is ``None``, the "text" input should be a
        string of space separated integer indices. Otherwise it should be a
        string of space separated symbols. The "weight" output is a lattice
        weight consisting of (graph-score, acoustic-score).

        Args:
            input (object): Input to align.
            text (str): Reference text to align.

        Returns:
            A dictionary representing alignment output.

        Raises:
            RuntimeError: If alignment fails.
            ValueError: If the input is empty, or if :attr:`symbols` is
                ``None`` and the text contains a non-integer token.
        """
        words = self._text_to_words(text)
        graph = self.graph_compiler.compile_graph_from_text(words)
        # NOTE(review): the empty list is presumably the disambiguation-symbol
        # argument of add_transition_probs -- confirm against the binding.
        _hmm.add_transition_probs(self.transition_model, [],
                                  self.transition_scale, self.self_loop_scale,
                                  graph)

        decoder = _dec.FasterDecoder(graph, self.decoder_opts)
        decoder.decode(self._make_decodable(input))
        if not decoder.reached_final():
            raise RuntimeError("No final state was active on the last frame.")
        try:
            best_path = decoder.get_best_path()
        except RuntimeError:
            raise RuntimeError("Empty alignment output.")

        # The alignment, weight and likelihood are taken from the path before
        # the rescaling below is applied to it.
        ali, _, weight = _fst_utils.get_linear_symbol_sequence(best_path)
        likelihood = - (weight.value1 + weight.value2)

        # Rescale the best path by 1 / acoustic_scale; skipped for a zero
        # scale, which would otherwise divide by zero.
        if self.acoustic_scale != 0.0:
            scale = _fst_utils.acoustic_lattice_scale(1.0 / self.acoustic_scale)
            _fst_utils.scale_lattice(scale, best_path)
        best_path = _fst_utils.convert_lattice_to_compact_lattice(best_path)

        return {
            "alignment": ali,
            "best_path": best_path,
            "likelihood": likelihood,
            "weight": weight
        }

    def to_phone_alignment(self, alignment, phones=None):
        """Converts frame-level alignment to phone-level alignment.

        Args:
            alignment (List[int]): Frame-level alignment.
            phones (SymbolTable): The phone symbol table. If provided, output
                includes symbols instead of integer indices.

        Returns:
            List[Tuple[int,int,int]]: A list of triplets representing, for
            each phone in the input, the phone index/symbol, the begin time (in
            frames) and the duration (in frames).

        Raises:
            RuntimeError: If the alignment cannot be split into phones.
        """
        success, split_ali = _hmm.split_to_phones(self.transition_model,
                                                  alignment)
        if not success:
            raise RuntimeError("Alignment phone split failed.")

        phone_start, phone_alignment = 0, []
        for entry in split_ali:
            # The phone is looked up from the first transition-id of the
            # segment; the segment length is the duration in frames.
            phone = self.transition_model.transition_id_to_phone(entry[0])
            if phones:
                phone = phones.find_symbol(phone)
            phone_duration = len(entry)
            phone_alignment.append((phone, phone_start, phone_duration))
            phone_start += phone_duration
        return phone_alignment

    def to_word_alignment(self, best_path, word_boundary_info):
        """Converts best alignment path to word-level alignment.

        Args:
            best_path (CompactLattice): Best alignment path.
            word_boundary_info (WordBoundaryInfo): Word boundary information.

        Returns:
            List[Tuple[int,int,int]]: A list of triplets representing, for each
            word in the input, the word index/symbol, the begin time (in frames)
            and the duration (in frames). The zero/epsilon words correspond to
            optional silences.

        Raises:
            RuntimeError: If lattice word alignment fails.
        """
        success, best_path = _lat_align.word_align_lattice(
            best_path, self.transition_model, word_boundary_info, 0)
        if not success:
            raise RuntimeError("Lattice word alignment failed.")

        word_alignment = _lat_funcs.compact_lattice_to_word_alignment(best_path)
        # zip(*...) turns the parallel sequences into per-word tuples.
        entries = zip(*word_alignment)
        if self.symbols:
            return [(self.symbols.find_symbol(entry[0]), entry[1], entry[2])
                    for entry in entries]
        return list(entries)
class MappedAligner(Aligner):
    """Mapped speech aligner.

    This can be used to align phone-id log-likelihood matrices with reference
    texts. It differs from the base aligner only in the decodable object it
    builds, which is also given the transition model.

    Args:
        transition_model (TransitionModel): The transition model.
        tree (ContextDependency): The phonetic decision tree.
        lexicon (StdFst): The lexicon FST.
        symbols (SymbolTable): The symbol table. If provided, "text" input of
            :meth:`align` should include symbols instead of integer indices.
        disambig_symbols (List[int]): Disambiguation symbols.
        graph_compiler_opts (TrainingGraphCompilerOptions): Configuration
            options for graph compiler.
        beam (float): Decoding beam used in alignment.
        transition_scale (float): The scale on non-self-loop transition
            probabilities.
        self_loop_scale (float): The scale on self-loop transition
            probabilities.
        acoustic_scale (float): Acoustic score scale.
    """

    def _make_decodable(self, loglikes):
        """Builds the decodable object wrapping the input log-likelihoods.

        Args:
            loglikes (object): Input log-likelihoods.

        Returns:
            DecodableMatrixScaledMapped: A decodable object for computing scaled
            log-likelihoods.

        Raises:
            ValueError: If the input has no rows.
        """
        # Reject empty input before constructing the decodable.
        if loglikes.num_rows == 0:
            raise ValueError("Empty loglikes matrix.")
        decodable = _dec.DecodableMatrixScaledMapped(
            self.transition_model, loglikes, self.acoustic_scale)
        return decodable
class GmmAligner(Aligner):
    """GMM based speech aligner.

    This can be used to align feature matrices with reference texts.

    Args:
        transition_model (TransitionModel): The transition model.
        acoustic_model (AmDiagGmm): The acoustic model.
        tree (ContextDependency): The phonetic decision tree.
        lexicon (StdFst): The lexicon FST.
        symbols (SymbolTable): The symbol table. If provided, "text" input of
            :meth:`align` should include symbols instead of integer indices.
        disambig_symbols (List[int]): Disambiguation symbols.
        graph_compiler_opts (TrainingGraphCompilerOptions): Configuration
            options for graph compiler.
        beam (float): Decoding beam used in alignment.
        transition_scale (float): The scale on non-self-loop transition
            probabilities.
        self_loop_scale (float): The scale on self-loop transition
            probabilities.
        acoustic_scale (float): Acoustic score scale.

    Raises:
        TypeError: If acoustic_model is not an AmDiagGmm object.
    """

    def __init__(self, transition_model, acoustic_model, tree, lexicon,
                 symbols=None, disambig_symbols=None, graph_compiler_opts=None,
                 beam=200.0, transition_scale=1.0, self_loop_scale=1.0,
                 acoustic_scale=0.1):
        # The acoustic model is validated and stored before the base class
        # sets up the graph compiler and decoder options.
        if not isinstance(acoustic_model, _gmm_am.AmDiagGmm):
            raise TypeError("acoustic_model should be a AmDiagGmm object")
        self.acoustic_model = acoustic_model
        super(GmmAligner, self).__init__(
            transition_model, tree, lexicon,
            symbols=symbols,
            disambig_symbols=disambig_symbols,
            graph_compiler_opts=graph_compiler_opts,
            beam=beam,
            transition_scale=transition_scale,
            self_loop_scale=self_loop_scale,
            acoustic_scale=acoustic_scale)

    @staticmethod
    def read_model(model_rxfilename):
        """Reads model from an extended filename.

        The transition model is read first, then the acoustic model, both
        from the same opened input.

        Args:
            model_rxfilename (str): Extended filename for reading the model.

        Returns:
            Tuple[TransitionModel, AmDiagGmm]: A (transition model, acoustic
            model) pair.
        """
        with _util_io.xopen(model_rxfilename) as ki:
            trans_model = _hmm.TransitionModel().read(ki.stream(), ki.binary)
            am_gmm = _gmm_am.AmDiagGmm().read(ki.stream(), ki.binary)
        return trans_model, am_gmm

    @classmethod
    def from_files(cls, model_rxfilename, tree_rxfilename, lexicon_rxfilename,
                   symbols_filename=None, disambig_rxfilename=None,
                   graph_compiler_opts=None, beam=200.0, transition_scale=1.0,
                   self_loop_scale=1.0, acoustic_scale=0.1):
        """Constructs a new GMM aligner from given files.

        Args:
            model_rxfilename (str): Extended filename for reading the model.
            tree_rxfilename (str): Extended filename for reading the phonetic
                decision tree.
            lexicon_rxfilename (str): Extended filename for reading the lexicon
                FST.
            symbols_filename (str): The symbols file. If provided, "text" input
                of :meth:`align` should include symbols instead of integer
                indices.
            disambig_rxfilename (str): Extended filename for reading the list
                of disambiguation symbols.
            graph_compiler_opts (TrainingGraphCompilerOptions): Configuration
                options for graph compiler.
            beam (float): Decoding beam used in alignment.
            transition_scale (float): The scale on non-self-loop transition
                probabilities.
            self_loop_scale (float): The scale on self-loop transition
                probabilities.
            acoustic_scale (float): Acoustic score scale.

        Returns:
            A new aligner object.
        """
        # Resources are loaded in this fixed order: model, tree, lexicon,
        # symbols, disambiguation symbols.
        model_pair = cls.read_model(model_rxfilename)
        transition_model, acoustic_model = model_pair
        tree = cls.read_tree(tree_rxfilename)
        lexicon = cls.read_lexicon(lexicon_rxfilename)
        symbols = cls.read_symbols(symbols_filename)
        disambig_symbols = cls.read_disambig_symbols(disambig_rxfilename)
        aligner = cls(transition_model, acoustic_model, tree, lexicon,
                      symbols, disambig_symbols, graph_compiler_opts, beam,
                      transition_scale, self_loop_scale, acoustic_scale)
        return aligner

    def _make_decodable(self, features):
        """Builds the decodable object wrapping the input features.

        Args:
            features (object): Input features.

        Returns:
            DecodableAmDiagGmmScaled: A decodable object for computing scaled
            log-likelihoods.

        Raises:
            ValueError: If the feature matrix has no rows.
        """
        if features.num_rows == 0:
            raise ValueError("Empty feature matrix.")
        decodable = _gmm_am.DecodableAmDiagGmmScaled(
            self.acoustic_model, self.transition_model,
            features, self.acoustic_scale)
        return decodable
class NnetAligner(Aligner):
    """Neural network based speech aligner.

    This can be used to align feature matrices with reference texts. The
    acoustic scale handed to the base aligner is taken from
    ``decodable_opts.acoustic_scale``.

    Args:
        transition_model (TransitionModel): The transition model.
        acoustic_model (AmNnetSimple): The acoustic model.
        tree (ContextDependency): The phonetic decision tree.
        lexicon (StdFst): The lexicon FST.
        symbols (SymbolTable): The symbol table. If provided, "text" input of
            :meth:`align` should include symbols instead of integer indices.
        disambig_symbols (List[int]): Disambiguation symbols.
        graph_compiler_opts (TrainingGraphCompilerOptions): Configuration
            options for graph compiler.
        beam (float): Decoding beam used in alignment.
        transition_scale (float): The scale on non-self-loop transition
            probabilities.
        self_loop_scale (float): The scale on self-loop transition
            probabilities.
        decodable_opts (NnetSimpleComputationOptions): Configuration options for
            simple nnet3 am decodable objects.
        online_ivector_period (int): Online ivector period. Relevant only if
            online ivectors are used.

    Raises:
        TypeError: If acoustic_model is not an AmNnetSimple object, or if
            decodable_opts is given but is not a NnetSimpleComputationOptions
            object.
    """

    def __init__(self, transition_model, acoustic_model, tree, lexicon,
                 symbols=None, disambig_symbols=None, graph_compiler_opts=None,
                 beam=200.0, transition_scale=1.0, self_loop_scale=1.0,
                 decodable_opts=None, online_ivector_period=10):
        if not isinstance(acoustic_model, _nnet3.AmNnetSimple):
            raise TypeError("acoustic_model should be a AmNnetSimple object")
        self.acoustic_model = acoustic_model

        # Prepare the network obtained from the acoustic model: batchnorm and
        # dropout are switched to test mode and the model is collapsed.
        network = self.acoustic_model.get_nnet()
        _nnet3.set_batchnorm_test_mode(True, network)
        _nnet3.set_dropout_test_mode(True, network)
        _nnet3.collapse_model(_nnet3.CollapseModelConfig(), network)

        # A falsy decodable_opts falls back to default options; anything else
        # must be of the expected options type.
        if not decodable_opts:
            decodable_opts = _nnet3.NnetSimpleComputationOptions()
        elif not isinstance(decodable_opts,
                            _nnet3.NnetSimpleComputationOptions):
            raise TypeError("decodable_opts should be either None or a "
                            "NnetSimpleComputationOptions object")
        self.decodable_opts = decodable_opts

        self.compiler = _nnet3.CachingOptimizingCompiler.new_with_optimize_opts(
            network, self.decodable_opts.optimize_config)
        self.online_ivector_period = online_ivector_period
        super(NnetAligner, self).__init__(
            transition_model, tree, lexicon,
            symbols=symbols,
            disambig_symbols=disambig_symbols,
            graph_compiler_opts=graph_compiler_opts,
            beam=beam,
            transition_scale=transition_scale,
            self_loop_scale=self_loop_scale,
            acoustic_scale=self.decodable_opts.acoustic_scale)

    @staticmethod
    def read_model(model_rxfilename):
        """Reads model from an extended filename.

        The transition model is read first, then the acoustic model, both
        from the same opened input.

        Args:
            model_rxfilename (str): Extended filename for reading the model.

        Returns:
            Tuple[TransitionModel, AmNnetSimple]: A (transition model, acoustic
            model) pair.
        """
        with _util_io.xopen(model_rxfilename) as ki:
            trans_model = _hmm.TransitionModel().read(ki.stream(), ki.binary)
            am_nnet = _nnet3.AmNnetSimple().read(ki.stream(), ki.binary)
        return trans_model, am_nnet

    @classmethod
    def from_files(cls, model_rxfilename, tree_rxfilename, lexicon_rxfilename,
                   symbols_filename=None, disambig_rxfilename=None,
                   graph_compiler_opts=None, beam=200.0, transition_scale=1.0,
                   self_loop_scale=1.0, decodable_opts=None,
                   online_ivector_period=10):
        """Constructs a new nnet3 aligner from given files.

        Args:
            model_rxfilename (str): Extended filename for reading the model.
            tree_rxfilename (str): Extended filename for reading the phonetic
                decision tree.
            lexicon_rxfilename (str): Extended filename for reading the lexicon
                FST.
            symbols_filename (str): The symbols file. If provided, "text" input
                of :meth:`align` should include symbols instead of integer
                indices.
            disambig_rxfilename (str): Extended filename for reading the list
                of disambiguation symbols.
            graph_compiler_opts (TrainingGraphCompilerOptions): Configuration
                options for graph compiler.
            beam (float): Decoding beam used in alignment.
            transition_scale (float): The scale on non-self-loop transition
                probabilities.
            self_loop_scale (float): The scale on self-loop transition
                probabilities.
            decodable_opts (NnetSimpleComputationOptions): Configuration options
                for simple nnet3 am decodable objects.
            online_ivector_period (int): Online ivector period. Relevant only if
                online ivectors are used.

        Returns:
            A new aligner object.
        """
        # Resources are loaded in this fixed order: model, tree, lexicon,
        # disambiguation symbols, symbols.
        model_pair = cls.read_model(model_rxfilename)
        transition_model, acoustic_model = model_pair
        tree = cls.read_tree(tree_rxfilename)
        lexicon = cls.read_lexicon(lexicon_rxfilename)
        disambig_symbols = cls.read_disambig_symbols(disambig_rxfilename)
        symbols = cls.read_symbols(symbols_filename)
        aligner = cls(transition_model, acoustic_model, tree, lexicon,
                      symbols, disambig_symbols, graph_compiler_opts, beam,
                      transition_scale, self_loop_scale, decodable_opts,
                      online_ivector_period)
        return aligner

    def _make_decodable(self, features):
        """Builds the decodable object wrapping the input features.

        Input can be just a feature matrix or a tuple of a feature matrix and
        an ivector or a tuple of a feature matrix and an online ivector matrix.

        Args:
            features (Matrix or Tuple[Matrix, Vector] or Tuple[Matrix, Matrix]):
                Input features.

        Returns:
            DecodableAmNnetSimple: A decodable object for computing scaled
            log-likelihoods.

        Raises:
            ValueError: If the feature matrix has no rows.
        """
        ivector = online_ivectors = None
        if isinstance(features, tuple):
            features, extra = features
            # A matrix as the second element is treated as online ivectors;
            # anything else is treated as a single ivector.
            if isinstance(extra, _kaldi_matrix.MatrixBase):
                online_ivectors = extra
            else:
                ivector = extra
        if features.num_rows == 0:
            raise ValueError("Empty feature matrix.")
        decodable = _nnet3.DecodableAmNnetSimple(
            self.decodable_opts, self.transition_model, self.acoustic_model,
            features, ivector, online_ivectors, self.online_ivector_period,
            self.compiler)
        return decodable