utils/asr-score_en

#!/usr/bin/env python3
# coding=utf8
# Copyright  2022  Zhenxiang MA (Tsinghua)
# Copyright  2022  Jiayu DU (SpeechColab)

import sys, os
import argparse
from typing import Iterable
import json, csv
import logging
logging.basicConfig(stream=sys.stderr, level=logging.INFO, format='[%(levelname)s] %(message)s')

import pynini
from pynini.lib import pynutil

# reference: https://github.com/kylebgorman/pynini/blob/master/pynini/lib/edit_transducer.py
# to import original lib:
#     from pynini.lib.edit_transducer import EditTransducer
class EditTransducer:
    """Token-level edit transducer, stored as a left and a right factor.

    Adapted from pynini's ``pynini.lib.edit_transducer``. The left factor
    (``self._e_i``) maps vocabulary tokens either to themselves or to one of
    the edit markers below; the right factor (``self._e_o``) is derived from
    the left one by ``_right_factor``. Composing
    ``input @ left @ right @ output`` yields the lattice of all alignments.
    """

    # Names of the edit markers. They are written in brackets ("[<delete>]")
    # when compiled; presumably that is what registers them in pynini's
    # generated symbols, where _right_factor looks them up -- TODO confirm.
    DELETE = "<delete>"
    INSERT = "<insert>"
    SUBSTITUTE = "<substitute>"

    def __init__(self,
        symbol_table,
        vocab: Iterable[str],
        edit_ext: pynini.FstLike,
        insert_cost: float = 1.0,
        delete_cost: float = 1.0,
        substitute_cost: float = 1.0,
        bound: int = 0,
    ) :
        """Builds the left and right factors.

        Args:
            symbol_table: passed as ``token_type`` when compiling each vocab
                token into an acceptor.
            vocab: tokens forming the alphabet (sigma) of the transducer.
            edit_ext: extra FST that is unioned into the right factor (see
                ``_right_factor``).
            insert_cost: weight of an insertion (half of it is put on the
                left factor).
            delete_cost: weight of a deletion (half of it is put on the left
                factor).
            substitute_cost: weight of a substitution (half of it is put on
                the left factor).
            bound: if non-zero, the left factor allows at most ``bound``
                edits; 0 means any number of edits.
        """
        # Left factor; note that we divide the edit costs by two because they also
        # will be incurred when traversing the right factor.
        sigma = pynini.union(
            *[ pynini.accep(token, token_type = symbol_table) for token in vocab ], 
        ).optimize()

        # insert: nothing -> [<insert>]
        insert = pynutil.insert(f"[{self.INSERT}]", weight=insert_cost / 2)
        # delete: any vocab token -> [<delete>]
        delete = pynini.cross(
            sigma,
            pynini.accep(f"[{self.DELETE}]", weight=delete_cost / 2)
        )
        # substitute: any vocab token -> [<substitute>]
        substitute = pynini.cross(
            sigma, pynini.accep(f"[{self.SUBSTITUTE}]", weight=substitute_cost / 2)
        )

        edit = pynini.union(insert, delete, substitute).optimize()

        if bound:
            # sigma* followed by `bound` repetitions of (optional edit, sigma*)
            sigma_star = pynini.closure(sigma)
            self._e_i = sigma_star.copy()
            for _ in range(bound):
                self._e_i.concat(edit.ques).concat(sigma_star)
        else:
            # unbounded: closure over (edit | sigma)
            self._e_i = edit.union(sigma).closure()

        self._e_i.optimize()
        self._e_o = EditTransducer._right_factor(self._e_i, edit_ext)

    @staticmethod
    def _right_factor(ifst: pynini.Fst, edit_ext: pynini.FstLike) -> pynini.Fst:
        """Derives the right factor from the left factor.

        The left factor is inverted, the insert and delete marker labels are
        swapped on its input side, and the result is unioned with
        ``edit_ext`` before taking the closure and optimizing.
        """
        ofst = pynini.invert(ifst)
        syms = pynini.generated_symbols()
        insert_label = syms.find(EditTransducer.INSERT)
        delete_label = syms.find(EditTransducer.DELETE)
        # swap insert <-> delete on the input side of the inverted factor
        pairs = [(insert_label, delete_label), (delete_label, insert_label)]
        right_factor = ofst.relabel_pairs(ipairs=pairs)

        right_factor_aux = pynini.union(right_factor, edit_ext).closure().optimize()
        #print('R_start:', right_factor_aux.start())
        #print('R:', right_factor_aux)
        return right_factor_aux

    def create_lattice(self, iexpr: pynini.FstLike, oexpr: pynini.FstLike) -> pynini.Fst:
        """Composes ``iexpr`` and ``oexpr`` through both factors.

        Raises:
            RuntimeError: if the resulting lattice has no start state.
        """
        lattice = (iexpr @ self._e_i) @ (self._e_o @ oexpr)
        EditTransducer.check_wellformed_lattice(lattice)
        return lattice

    @staticmethod
    def check_wellformed_lattice(lattice: pynini.Fst) -> None:
        """Raises RuntimeError when the lattice has no start state (is empty)."""
        if lattice.start() == pynini.NO_STATE_ID:
            raise RuntimeError("Edit distance composition lattice is empty.")

    def compute_distance(self, iexpr: pynini.FstLike, oexpr: pynini.FstLike) -> float:
        """Returns the cost of the cheapest path through the alignment lattice."""
        lattice = self.create_lattice(iexpr, oexpr)
        # The shortest cost from all final states to the start state is
        # equivalent to the cost of the shortest path.
        start = lattice.start()
        return float(pynini.shortestdistance(lattice, reverse=True)[start])
  
    def compute_alignment(self, iexpr: pynini.FstLike, oexpr: pynini.FstLike) -> pynini.FstLike:
        """Returns the single shortest path of the alignment lattice, optimized."""
        lattice = self.create_lattice(iexpr, oexpr)
        alignment = pynini.shortestpath(lattice, nshortest=1, unique=True)
        return alignment.optimize()


class EvaluationResult:
    """Corpus-level counters and rates of one scoring run.

    All attributes are plain public fields that the caller fills in; the
    ``to_*`` methods only render them as text.
    """

    def __init__(self):
        # utterance bookkeeping
        self.num_ref_utts = 0
        self.num_hyp_utts = 0
        self.num_eval_utts = 0  # seen in both ref & hyp
        self.num_hyp_without_ref = 0

        # token-level counts: correct, substitutions, insertions, deletions
        self.C = self.S = self.I = self.D = 0
        self.token_error_rate = 0.0

        # utterance-level error statistics
        self.num_utts_with_error = 0
        self.sentence_error_rate = 0.0

    def to_json(self):
        """Return all fields as a single-line JSON object."""
        return json.dumps(vars(self))

    def to_kaldi(self):
        """Return a two-line '%WER' / '%SER' report."""
        num_errors = self.S + self.D + self.I
        num_ref_tokens = self.C + self.S + self.D
        wer_line = f'%WER {self.token_error_rate:.2f} [ {num_errors} / {num_ref_tokens}, {self.I} ins, {self.D} del, {self.S} sub ]\n'
        ser_line = f'%SER {self.sentence_error_rate:.2f} [ {self.num_utts_with_error} / {self.num_eval_utts} ]\n'
        return wer_line + ser_line

    def to_summary(self):
        """Return the multi-line, human-readable statistics table."""
        num_tokens = self.C + self.S + self.D
        num_edits = self.S + self.I + self.D
        rows = [
            '==================== Overall Statistics ====================\n',
            f'num_ref_utts: {self.num_ref_utts}\n',
            f'num_hyp_utts: {self.num_hyp_utts}\n',
            f'num_hyp_without_ref: {self.num_hyp_without_ref}\n',
            f'num_eval_utts: {self.num_eval_utts}\n',
            f'sentence_error_rate: {self.sentence_error_rate:.2f}%\n',
            f'token_error_rate: {self.token_error_rate:.2f}%\n',
            'token_stats:\n',
            f'  - tokens:{num_tokens:>7}\n',
            f'  - edits: {num_edits:>7}\n',
            f'  - cor:   {self.C:>7}\n',
            f'  - sub:   {self.S:>7}\n',
            f'  - ins:   {self.I:>7}\n',
            f'  - del:   {self.D:>7}\n',
            '============================================================\n',
        ]
        return ''.join(rows)


class Utterance:
    """One transcript entry: an utterance id together with its text."""

    def __init__(self, uid, text):
        self.uid, self.text = uid, text


def LoadUtterances(filepath, format):
    """Load a transcript file into a dict keyed by utterance id.

    Args:
        filepath: path of the transcript file, read as UTF-8.
        format: file format. Only 'text' is supported: one utterance per
            line, written as ``utt_id word1 word2 ...``.

    Returns:
        dict mapping utterance id -> Utterance. Blank lines are ignored; a
        line holding only an id yields an Utterance with empty text.

    Raises:
        RuntimeError: if ``format`` is unsupported or an utterance id occurs
            more than once.
    """
    if format != 'text':
        raise RuntimeError(F'Unsupported text format {format}')

    utts = {}
    with open(filepath, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # first whitespace-delimited column is the id, the rest is the text
            uid, *rest = line.split(maxsplit=1)
            if uid in utts:
                raise RuntimeError(F'Found duplicated utterence id {uid}')
            utts[uid] = Utterance(uid, rest[0] if rest else '')
    return utts


def Tokenize(text, tokenizer = 'whitespace'):
    """Split ``text`` into a list of tokens.

    'whitespace' splits on runs of whitespace; 'char' discards all
    whitespace and returns one token per remaining character.
    Any other tokenizer name raises RuntimeError.
    """
    if tokenizer not in ('whitespace', 'char'):
        raise RuntimeError(F'ERROR: Unsupported tokenizer {tokenizer}')

    words = text.split()
    if tokenizer == 'whitespace':
        return words
    return list(''.join(words))


def tokens_without_hyphen(token : str):
    """Return the extra vocabulary entries implied by a hyphenated token.

    'T-SHIRT' should also introduce new words into vocabulary, e.g.:
      1. 'T' & 'SHIRT'
      2. 'TSHIRT'

    Empty pieces produced by a leading, trailing or doubled hyphen
    ('WELL-', '--', '-') are dropped, so the result never contains ''.

    Args:
        token: a token containing at least one '-'.

    Returns:
        The non-empty hyphen-separated parts followed by the parts glued
        together; an empty list when the token consists of hyphens only.
    """
    assert('-' in token)
    parts = [ part for part in token.split('-') if part ]
    glued = ''.join(parts)  # same as token.replace('-', '')
    if not glued:
        return []
    return parts + [glued]


def LoadGLM(rel_path):
    '''
    Load GLM rewriting rules from a CSV file and collect their vocabulary.

    Each CSV row is one rule: a list of phrases that are treated as
    alternatives of each other. Phrases are stripped of surrounding
    whitespace; empty phrases (e.g. from a trailing comma) are dropped and
    rows left without any phrase are skipped. Rule names keep the index of
    the CSV row they came from.

    glm.csv:
        I'VE,I HAVE
        GOING TO,GONNA
        ...
        T-SHIRT,T SHIRT,TSHIRT

    glm:
        {
            '<RULE_00000>': ["I'VE", 'I HAVE'],
            '<RULE_00001>': ['GOING TO', 'GONNA'],
            ...
            '<RULE_99999>': ['T-SHIRT', 'T SHIRT', 'TSHIRT'],
        }

    Args:
        rel_path: path of the CSV file, relative to the directory that
            contains this script.

    Returns:
        (glm, vocab): the rule dict shown above, and the list of distinct
        tokens occurring in any phrase, including the hyphen-free variants
        of hyphenated tokens.
    '''
    logging.info(f'Loading GLM from {rel_path} ...')

    abs_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)

    glm = {}
    # newline='' is what the csv module asks for when it is given a file object;
    # the with-block makes sure the file is closed again.
    with open(abs_path, encoding="utf-8", newline='') as f:
        for k, row in enumerate(csv.reader(f, delimiter=',')):
            phrases = [ phrase.strip() for phrase in row ]
            # an empty alternative cannot be compiled into a meaningful rule
            phrases = [ phrase for phrase in phrases if phrase ]
            if not phrases:
                continue
            glm[f'<RULE_{k:05d}>'] = phrases
    logging.info(f'  #rule: {len(glm)}')

    tokens = set()
    for phrases in glm.values():
        for phrase in phrases:
            for t in Tokenize(phrase):
                tokens.add(t)
                if '-' in t:
                    tokens.update(tokens_without_hyphen(t))
    vocab = list(tokens)
    logging.info(f'  #vocab: {len(vocab)}')

    return glm, vocab


def SymbolEQ(symtab, i1, i2):
    """Tell whether two symbol ids denote the same token, ignoring the aux mark.

    The auxiliary form of a token is the token followed by exactly one '#'.
    Two symbols are therefore equal when their strings are identical or when
    one is the other plus a single trailing '#'. Unlike ``str.strip('#')``,
    this does not conflate tokens that merely start with '#' or that differ
    by more than one '#'.

    Args:
        symtab: symbol table whose ``find(id)`` returns the symbol string.
        i1: first symbol id.
        i2: second symbol id.

    Returns:
        True if the two symbols match, False otherwise.
    """
    s1, s2 = symtab.find(i1), symtab.find(i2)
    return s1 == s2 or s1 == s2 + '#' or s2 == s1 + '#'


def PrintSymbolTable(symbol_table: pynini.SymbolTable):
    """Print every (id, symbol) pair of the table to stdout, for debugging."""
    print('SYMBOL_TABLE:')
    num_symbols = symbol_table.num_symbols()
    for idx in range(num_symbols):
        symbol = symbol_table.find(idx)
        # find() looks up in both directions (id <-> sym): check the round trip
        assert symbol_table.find(symbol) == idx
        print(idx, symbol)
    print()


def BuildSymbolTable(*vocabs) -> pynini.SymbolTable :
    """Create a symbol table holding '<epsilon>' plus every token of every vocab.

    '<epsilon>' is added first; the rest of this script treats id 0 as
    epsilon. Vocabularies are added in the order given.
    """
    logging.info('Building symbol table ...')
    table = pynini.SymbolTable()
    table.add_symbol('<epsilon>')

    for token in (t for vocab in vocabs for t in vocab):
        table.add_symbol(token)

    logging.info(f'  #symbols: {table.num_symbols()}')
    return table


def BuildGLMTagger(glm, symbol_table):
    """Build the rewrite FST that wraps every GLM phrase in its rule tag.

    For each phrase of each rule, a transducer is built that inserts the
    rule tag, accepts the phrase, and inserts the rule tag again. The union
    of all of them is applied as a context-free rewrite over the closure of
    all non-epsilon symbols of ``symbol_table``.
    """
    logging.info('Building tagger for GLM rewriting rules ...')

    def accept(text):
        return pynini.accep(text, token_type = symbol_table)

    rule_taggers = [
        pynutil.insert(accept(rule_tag)) + accept(phrase) + pynutil.insert(accept(rule_tag))
        for rule_tag, rule in glm.items()
        for phrase in rule
    ]

    # every symbol except id 0 (epsilon)
    alphabet = pynini.union(
        *[ accept(sym) for idx, sym in symbol_table if idx != 0 ]
    ).optimize()
    sigma_star = alphabet.closure()

    # note: this could be slow with large vocabulary
    tagger = pynini.cdrewrite(pynini.union(*rule_taggers), '', '', sigma_star).optimize()
    logging.info('GLM tagger built.')

    return tagger


def PrintPrettyAlignment(raw, hyp, ref, edits, stream = sys.stderr):
    """Print an aligned HYP / REF / EDIT view of one utterance.

    Four lines are written to ``stream``: the raw hypothesis, then the
    hypothesis tokens, reference tokens and edit labels, padded so that the
    columns line up. A side that has no token for an edit shows '*', and
    correct ('C') positions show an empty edit label.
    """
    def display_width(token: str):
        # characters in U+4E00..U+9FA5 count as two columns, all others as one
        return sum(2 if '\u4e00' <= ch <= '\u9fa5' else 1 for ch in token)

    hyp_tokens = iter(Tokenize(hyp.strip()))
    ref_tokens = iter(Tokenize(ref.strip()))

    hyp_cells, ref_cells, edit_cells = [], [], []
    for edit in edits:
        # a deletion has no hypothesis token, an insertion has no reference token;
        # an exhausted side also falls back to '*'
        h = '*' if edit == 'D' else next(hyp_tokens, '*')
        r = '*' if edit == 'I' else next(ref_tokens, '*')
        e = '' if edit == 'C' else edit

        widths = {'r': display_width(r), 'h': display_width(h), 'e': display_width(e)}
        column = max(widths.values()) + 1

        ref_cells.append(r + ' ' * (column - widths['r']))
        hyp_cells.append(h + ' ' * (column - widths['h']))
        edit_cells.append(e + ' ' * (column - widths['e']))

    print('  HYP  : ' + raw, file = stream)
    print('  HYP# : ' + ''.join(hyp_cells), file = stream)
    print('  REF  : ' + ''.join(ref_cells), file = stream)
    print('  EDIT : ' + ''.join(edit_cells), file = stream)


def ComputeTokenErrorRate(c, s, i, d):
    """Return the token error rate in percent.

    Errors (substitutions + deletions + insertions) are divided by the
    number of reference tokens (substitutions + deletions + correct), which
    must not be zero.
    """
    num_ref_tokens = s + d + c
    assert num_ref_tokens != 0
    num_errors = s + d + i
    return 100.0 * num_errors / num_ref_tokens


def ComputeSentenceErrorRate(num_err_utts, num_utts):
    """Return the share of utterances with errors, in percent.

    ``num_utts`` must not be zero.
    """
    assert num_utts != 0
    error_share = 100.0 * num_err_utts
    return error_share / num_utts


if __name__ == '__main__':
    # Script entry point:
    #   1. load REF and HYP transcripts and select the utterances to evaluate,
    #   2. build vocabulary, symbol table and the GLM tagger,
    #   3. per utterance: expand HYP with GLM alternatives, align it against
    #      REF with the edit transducer and count C/S/I/D,
    #   4. write per-utterance results and the summary to result_file, and
    #      print the JSON and Kaldi-style totals to stdout.
    parser = argparse.ArgumentParser()
    # optional
    # NOTE(review): args.tokenizer is parsed but never passed to any Tokenize()
    # call in this block, so every call uses the 'whitespace' default.
    parser.add_argument('--tokenizer', choices=['whitespace', 'char'], default='whitespace', help='whitespace for WER, char for CER')
    parser.add_argument('--ref-format', choices=['text'], default='text', help='reference format, first col is utt_id, the rest is text')
    parser.add_argument('--hyp-format', choices=['text'], default='text', help='hypothesis format, first col is utt_id, the rest is text')
    # required
    parser.add_argument('--ref', type=str, required=True, help='input reference file')
    parser.add_argument('--hyp', type=str, required=True, help='input hypothesis file')
    parser.add_argument('result_file', type=str)
    args = parser.parse_args()
    logging.info(args)


    logging.info('Loading REF & HYP ...')
    ref_utts = LoadUtterances(args.ref, args.ref_format)
    hyp_utts = LoadUtterances(args.hyp, args.hyp_format)

    res = EvaluationResult()

    # check valid utterances in hyp that have matched non-empty reference
    eval_utts = []
    for uid in sorted(hyp_utts.keys()):
        if uid in ref_utts.keys(): 
            if ref_utts[uid].text.strip(): # non-empty reference
                eval_utts.append(uid)
            else:
                logging.warning(F'Found {uid} with empty reference, skipping...')
        else:
            logging.warning(F'Found {uid} without reference, skipping...')
            res.num_hyp_without_ref += 1

    res.num_hyp_utts  = len(hyp_utts)
    res.num_ref_utts  = len(ref_utts)
    res.num_eval_utts = len(eval_utts)
    logging.info(f'  #hyp:{res.num_hyp_utts}, #ref:{res.num_ref_utts}, #eval:{res.num_eval_utts}')

    # vocabulary of all evaluated utterances (REF and HYP), plus the
    # hyphen-free variants of every hyphenated token
    tokens = []
    for uid in eval_utts:
        ref_tokens = Tokenize(ref_utts[uid].text)
        hyp_tokens = Tokenize(hyp_utts[uid].text)
        for t in ref_tokens + hyp_tokens:
            tokens.append(t)
            if '-' in t:
                tokens.extend(tokens_without_hyphen(t))
    vocab = list(set(tokens))
    #print(vocab)
    logging.info(f'  #vocab: {len(vocab)}')


    # the GLM file name is fixed; LoadGLM resolves it relative to this script's directory
    glm, vocab_glm = LoadGLM('glm_en.csv')
    #print(glm)
    #print(vocab_glm)


    symtab = BuildSymbolTable(
        vocab,
        vocab_glm,
        [ x + '#' for x in vocab_glm ], # we need this auxiliary version to mark GLM-expanded paths
        [ x for x in glm.keys() ], # GLM rule tags
    )
    #PrintSymbolTable(symtab)
    #symtab.write_text('symbol_table.txt')


    tagger = BuildGLMTagger(glm, symtab)

    # auxiliary edit transducer allows 0-cost matches between token and its auxiliary form
    # e.g.: 'I' -> 'I#', 'AM' -> 'AM#'
    auxiliary_edit_transducer = pynini.union(*[
        pynini.cross(
            pynini.accep(token,       token_type = symtab),
            pynini.accep(token + '#', token_type = symtab),
        ) for token in vocab_glm
    ]).optimize().closure()

    # processing HYP & REF 
    logging.info('Evaluating error rate ...')
    with open(args.result_file, 'w+', encoding='utf8') as fo:
        for uid in eval_utts:
            #print(uid, flush=True, file=sys.stderr)

            ref = ref_utts[uid].text
            hyp = hyp_utts[uid].text

            ref_tokens = Tokenize(ref)
            hyp_tokens = Tokenize(hyp)

            ref_fst = pynini.accep(' '.join(ref_tokens), token_type = symtab)
            #print(ref_fst.string(token_type = symtab))

            hyp_fst = pynini.accep(' '.join(hyp_tokens), token_type = symtab)
            #print(hyp_fst.string(token_type = symtab))

            # Say, we have
            #   GLM contains a rule: "I'M" <-> "I AM"
            #   REF: I AM HERE
            #   HYP: I'M HERE
            #
            # We want to expand HYP with alternative paths(marked with auxiliary #), following GLM rewriting rules:
            #   HYP#: {I'M | I# AM#} HERE
            # REF is honored to keep its original form.
            #
            # This could be considered as a flexible on-the-fly TN.

            # 1. GLM rule tagging:
            #   I'M HERE  ->  <RULE_001> I'M <RULE_001> HERE
            tagged_lattice = (hyp_fst @ tagger).optimize()
            tagged_text = pynini.shortestpath(tagged_lattice, nshortest=1, unique=True).string(token_type = symtab)
            #print(tagged_text)

            # 2. GLM rule expanding: 
            # <RULE_001> I'M <RULE_001> HERE  ->  {I'M | I# AM#} HERE
            # <TODO>: too messy, need further cleanup ......
            # tagged_fst starts as the empty-string acceptor and grows by
            # concatenation, one plain token or one tagged rule span at a time.
            tagged_tokens = tagged_text.split()
            tagged_fst = pynini.accep('', token_type = symtab)
            i = 0
            while i < len(tagged_tokens):
                word = tagged_tokens[i]
                if "<RULE_" not in word:
                    # plain token outside of any rule span
                    if "-" not in word:
                        tagged_fst +=pynini.accep(word, token_type =symtab)
                    else:
                        # hyphenated token: accept any of three readings
                        #   connect_graph  - the token as written, e.g. T-SHIRT
                        #   connect_graph0 - its parts as separate tokens, e.g. T SHIRT
                        #   connect_graph1 - its parts glued together, e.g. TSHIRT
                        words = word.split("-")
                        word1 = words[0]
                        connect_graph0 = pynini.accep(words[0],token_type =symtab)
                        connect_graph = pynini.accep(word,token_type =symtab)
                        
                        # NOTE(review): flag0 is assigned but never read in this block
                        flag0 = False
                        for k in range(1,len(words)):
                            connect_graph0 +=pynini.accep(words[k],token_type =symtab)
                            word1+=words[k]
                        # the glued form is (re-)added to the symbol table on the fly
                        symtab.add_symbol(word1)
                        connect_graph1 = pynini.accep(word1,token_type =symtab)
                        tagged_fst +=pynini.union(connect_graph,connect_graph0,connect_graph1).optimize()
                    i = i+1
                else:
                    # opening rule tag: the tag itself is the key into glm
                    key_name = word
                    list1 = glm[key_name]
                    part_word = ""
                    i = i+1
                    # collect the tokens up to the closing rule tag
                    while "<RULE_" not in tagged_tokens[i]:
                        part_word += tagged_tokens[i] + " "
                        i = i+1                   
                    # skip the closing tag
                    # NOTE(review): this condition is always true here, since this
                    # is the else-branch of `"<RULE_" not in word`
                    if "<RULE_" in word:
                        i = i + 1
                    # temp_graph becomes the union of all phrases of the rule:
                    # the phrase equal to the surface text keeps plain symbols,
                    # every other phrase uses the auxiliary '#' symbols.
                    # temp_flag tells whether temp_graph already holds a phrase.
                    temp_graph = pynini.accep("",token_type =symtab)
                    temp_flag = False
                    for synword in list1:
                        part_word = part_word.strip()
                        if synword == part_word:
                            # the phrase actually found in HYP: plain symbols
                            ttemp_graph = pynini.accep("",token_type =symtab)
                            if len(synword.split())>1:
                                synwords = synword.split()
                                for j in range(len(synwords)):
                                    ttemp_graph+=pynini.accep(synwords[j],token_type = symtab)
                                if not temp_flag:
                                    temp_graph = ttemp_graph.optimize()
                                    temp_flag = True
                                else:
                                    temp_graph|=ttemp_graph.optimize()
                            else:
                                ttemp_graph = pynini.accep(synword,token_type = symtab)
                                if not temp_flag:
                                    temp_graph = ttemp_graph.optimize()
                                    temp_flag = True
                                else:
                                    temp_graph|=ttemp_graph.optimize()
                        else:
                            # an alternative phrase of the same rule: auxiliary '#' symbols
                            ttemp_graph = pynini.accep("",token_type =symtab)
                            if len(synword.split())>1:
                                synwords = synword.split()
                                # print(synwords)
                                for j in range(len(synwords)):
                                    ttemp_graph+=pynini.accep(synwords[j] + "#",token_type = symtab)                                                               
                                if not temp_flag:
                                    temp_graph = ttemp_graph.optimize()
                                    temp_flag = True
                                else:
                                    temp_graph|=ttemp_graph.optimize()
                            else:
                                ttemp_graph = pynini.accep(synword + "#",token_type = symtab)
                                if not temp_flag:
                                    temp_graph = ttemp_graph.optimize()
                                    temp_flag = True
                                else:
                                    temp_graph|=ttemp_graph.optimize()
                    tagged_fst +=temp_graph.optimize()
            tagged_fst = tagged_fst.optimize()
            # verb_lattice = (tagged_fst @ verb).optimize()
            # verb_text = pynini.shortestpath(verb_lattice, nshortest=1, unique=True).string(token_type = symtab)  
            # print(verb_text)       
            # </TODO>

            # a fresh edit transducer is built for every utterance
            et = EditTransducer(
                symbol_table = symtab,            
                vocab = list(set(ref_tokens + hyp_tokens)), # contains vocabulary covering current utterance only
                edit_ext = auxiliary_edit_transducer
            )
            # REF is the input side of the alignment, the expanded HYP the output side
            alignment = et.compute_alignment(ref_fst, tagged_fst)

            # classify every arc of the best path by its (input, output) labels; label 0 is epsilon
            # NOTE(review): edits are collected in state-iteration order, which
            # assumes the state numbering follows the path order -- TODO confirm
            C, S, I, D = 0, 0, 0, 0
            edits = []
            distance = 0.0
            for state in alignment.states():
                for arc in alignment.arcs(state):
                    i, o = arc.ilabel, arc.olabel
                    if i != 0 and o != 0 and not SymbolEQ(symtab, i, o):
                        # different tokens on both sides
                        S += 1
                        e = 'S'
                        distance += 1.0
                    elif i != 0 and o == 0:
                        # REF token without a HYP counterpart
                        D += 1
                        e = 'D'
                        distance += 1.0
                    elif i == 0 and o != 0:
                        # HYP token without a REF counterpart
                        I += 1
                        e = 'I'
                        distance += 1.0
                    elif SymbolEQ(symtab, i, o):
                        C += 1
                        e = 'C'
                    else:
                        raise RuntimeError
                    edits.append(e)
                    # print('i:', i, symtab.find(i), 'o:', o, symtab.find(o), f'[{e}]')
            #assert(distance == et.compute_distance(ref_fst, tagged_fst)) # this should be used only in debugging checks, it doubles computations
            utt_ter = ComputeTokenErrorRate(C, S, I, D)

            # utt-level evaluation result
            # NOTE(review): uid is written without quotes, so this line is
            # JSON-like rather than strict JSON
            print(F'{{"uid":{uid}, "score":{-distance}, "ter":{utt_ter:.2f}, "cor":{C}, "sub":{S}, "ins":{I}, "del":{D}}}', file=fo)

            # presumably the HYP# (output) side of the best path -- TODO confirm which side string() reads
            hyp_aux = alignment.string(token_type = symtab)
            PrintPrettyAlignment(hyp, hyp_aux, ref, edits, fo)

            # an utterance counts as erroneous as soon as it has any edit
            if utt_ter > 0:
                res.num_utts_with_error += 1

            res.C += C
            res.S += S
            res.I += I
            res.D += D

        # corpus level evaluation result
        # NOTE(review): if eval_utts is empty, C + S + D is 0 and the assert in
        # ComputeTokenErrorRate fires here
        res.token_error_rate = ComputeTokenErrorRate(res.C, res.S, res.I, res.D)
        res.sentence_error_rate = ComputeSentenceErrorRate(res.num_utts_with_error, res.num_eval_utts)

        print(res.to_summary(), file=fo)

    # machine-readable and Kaldi-style totals go to stdout
    print(res.to_json())
    print(res.to_kaldi())