Skip to content

Commit

Permalink
Ignore faulty training data.
Browse files Browse the repository at this point in the history
We just fixed a bug where training data for the position after 2 pass
moves was being recorded. To avoid throwing away a few thousand
games, detect and skip this data in the chunk parser for now.
  • Loading branch information
gcp committed Nov 13, 2017
1 parent 5e5a7f8 commit 4157d2c
Showing 1 changed file with 12 additions and 3 deletions.
15 changes: 12 additions & 3 deletions training/tf/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import glob
import gzip
import random
import math
import multiprocessing as mp
import tensorflow as tf
from tfprocess import TFProcess
Expand Down Expand Up @@ -92,15 +93,21 @@ def convert_train_data(text_item):
planes.append([0.0] * 361)
planes.append([1.0] * 361)
assert len(planes) == 18
probabilities = [float(val) for val in text_item[17].split()]
probabilities = []
for val in text_item[17].split():
float_val = float(val)
# Work around a bug in leela-zero v0.3
if math.isnan(float_val):
return False, None
probabilities.append(float_val)
assert len(probabilities) == 362
winner = float(text_item[18])
assert winner == 1.0 or winner == -1.0
# Get one of 8 symmetries
symmetry = random.randrange(8)
sym_planes = [apply_symmetry(plane, symmetry) for plane in planes]
sym_probabilities = apply_symmetry(probabilities, symmetry)
return sym_planes, sym_probabilities, [winner]
return True, (sym_planes, sym_probabilities, [winner])

class ChunkParser:
def __init__(self, chunks):
Expand All @@ -123,7 +130,9 @@ def task(self, chunks, queue):
pick_offset = item_idx * DATA_ITEM_LINES
item = file_content[pick_offset:pick_offset + DATA_ITEM_LINES]
str_items = [str(line, 'ascii') for line in item]
queue.put(convert_train_data(str_items))
success, data = convert_train_data(str_items)
if success:
queue.put(data)

def parse_chunk(self):
while True:
Expand Down

0 comments on commit 4157d2c

Please sign in to comment.