Fix an evaluation bug in fairseq-generate (facebookresearch#1158)
Summary:
Pull Request resolved: fairinternal/fairseq-py#1158

When using BPE in --sacrebleu mode, the scores were computed before BPE was removed (on the H- strings rather than the D- strings). This is now fixed.
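The H-/D- distinction matters for scoring: a minimal sketch (hypothetical strings, not actual fairseq output) of why scoring the BPE'd hypothesis gives a different token sequence than scoring the detokenized one, assuming the common "@@ " subword continuation marker:

```python
# H- string: BPE still applied, subword units joined by "@@ " markers.
hypo_bpe = "the un@@ believ@@ able story"
# D- string: BPE removed, real words restored.
hypo_detok = hypo_bpe.replace("@@ ", "")

# A BLEU scorer that splits on spaces sees 5 subword "tokens" in the H- form
# but only 3 words in the D- form, so the n-gram statistics differ.
print(hypo_bpe.split())    # ['the', 'un@@', 'believ@@', 'able', 'story']
print(hypo_detok.split())  # ['the', 'unbelievable', 'story']
```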

In addition, warnings were added explaining that scoring target-side BPE output without --sacrebleu is a bad idea.
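The warnings exist because splitting a detokenized string on spaces leaves punctuation glued to words, unlike sacrebleu's standard 13a tokenization. A crude approximation (this regex is an assumption for illustration; the real 13a rules in sacrebleu handle more cases) shows the difference:

```python
import re

def naive_13a(s):
    # Rough stand-in for sacrebleu's "13a" tokenization: split punctuation
    # off from adjacent words before splitting on whitespace.
    return re.sub(r"([^\w\s])", r" \1 ", s).split()

sent = "Hello, world!"
print(sent.split())     # ['Hello,', 'world!']  -> punctuation glued to words
print(naive_13a(sent))  # ['Hello', ',', 'world', '!']
```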

Reviewed By: myleott

Differential Revision: D21260024

fbshipit-source-id: f8cf9e3a42e501043b794c841297940ab9e2b75a
Dmitriy Genzel authored and facebook-github-bot committed Apr 27, 2020
1 parent 7b3df95 commit dd518ef
1 changed file: fairseq_cli/generate.py (7 additions, 1 deletion)
@@ -239,8 +239,9 @@ def decode_fn(x):
 if align_dict is not None or args.remove_bpe is not None:
     # Convert back to tokens for evaluation with unk replacement and/or without BPE
     target_tokens = tgt_dict.encode_line(target_str, add_if_not_exist=True)
+    hypo_tokens = tgt_dict.encode_line(detok_hypo_str, add_if_not_exist=True)
 if hasattr(scorer, 'add_string'):
-    scorer.add_string(target_str, hypo_str)
+    scorer.add_string(target_str, detok_hypo_str)
 else:
     scorer.add(target_tokens, hypo_tokens)

@@ -252,6 +253,11 @@ def decode_fn(x):
 logger.info('Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'.format(
     num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1. / gen_timer.avg))
 if has_target:
+    if args.bpe and not args.sacrebleu:
+        if args.remove_bpe:
+            logger.warning("BLEU score is being computed by splitting detokenized string on spaces, this is probably not what you want. Use --sacrebleu for standard 13a BLEU tokenization")
+        else:
+            logger.warning("If you are using BPE on the target side, the BLEU score is computed on BPE tokens, not on proper words. Use --sacrebleu for standard 13a BLEU tokenization")
 logger.info('Generate {} with beam={}: {}'.format(args.gen_subset, args.beam, scorer.result_string()))

 return scorer
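The fix hinges on the scorer dispatch: string-based scorers (sacrebleu) must receive the detokenized hypothesis, while token-based scorers consume encoded tokens. A self-contained sketch with mock scorer classes (stand-ins for fairseq's scorers, not the real implementations):

```python
class StringScorer:
    """Mock of a string-based scorer such as fairseq's sacrebleu wrapper."""
    def __init__(self):
        self.pairs = []
    def add_string(self, ref, hyp):
        self.pairs.append((ref, hyp))

class TokenScorer:
    """Mock of a token-level BLEU scorer that consumes encoded tokens."""
    def __init__(self):
        self.pairs = []
    def add(self, ref_tokens, hyp_tokens):
        self.pairs.append((ref_tokens, hyp_tokens))

def score_one(scorer, target_str, detok_hypo_str, encode):
    # Mirrors the patched dispatch: the bug was passing the still-BPE'd
    # hypo_str here instead of detok_hypo_str for string-based scorers.
    if hasattr(scorer, "add_string"):
        scorer.add_string(target_str, detok_hypo_str)
    else:
        scorer.add(encode(target_str), encode(detok_hypo_str))

s = StringScorer()
score_one(s, "the unbelievable story", "the unbelievable story", str.split)
t = TokenScorer()
score_one(t, "a b", "a b", str.split)
```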
