Skip to content

Commit

Permalink
Filter by delta
Browse files Browse the repository at this point in the history
  • Loading branch information
GreenWizard2015 committed Sep 19, 2024
1 parent 0446554 commit 159c4fb
Showing 1 changed file with 55 additions and 29 deletions.
84 changes: 55 additions & 29 deletions scripts/preprocess-remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def splitDataset(dataset, ratio, framesPerChunk, skipAction):
for i, (start, end) in enumerate(dataset):
trainingIdx = testingIdx = []
if (end - start) < 2 * framesPerChunk:
print('Session %d is too short. Action: %s' % (i, skipAction))
# print('Session %d is too short. Action: %s' % (i, skipAction))
if 'drop' == skipAction: continue

rng = np.arange(start, end)
Expand All @@ -93,6 +93,9 @@ def splitDataset(dataset, ratio, framesPerChunk, skipAction):
if 0 < len(trainingIdx): trainingSet.append(trainingIdx)
if 0 < len(testingIdx): testing.append(testingIdx)
continue
if (0 == len(trainingSet)) or (0 == len(testing)):
print('No training or testing sets was created!')
return [], []
# save training and testing sets
testing = np.sort(np.concatenate(testing))
training = np.sort(np.concatenate(trainingSet))
Expand All @@ -108,6 +111,7 @@ def splitDataset(dataset, ratio, framesPerChunk, skipAction):

def dropPadding(idx, padding):
res = []
if len(idx) < 2: return res
# find consecutive frames chunks, save their start and end indices
gaps = np.where(1 < np.diff(idx))[0]
gaps = np.concatenate(([0], 1 + gaps, [len(idx)]))
Expand All @@ -120,19 +124,43 @@ def dropPadding(idx, padding):
res = [chunk[padding:-padding] for chunk in res]
# remove chunks that are too short
res = [chunk for chunk in res if padding < len(chunk)]
res = np.concatenate(res)
res = np.concatenate(res) if 0 < len(res) else []
print('Frames before: {}. Frames after: {}'.format(len(idx), len(res)))
return res

def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, skippedFrames, minimumFrames):
def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, skippedFrames, minimumFrames, dropZeroDeltas):
print('Processing', folder)
dataset = loadNpz(folder)
for k, v in dataset.items():
print(k, v.shape)
# load all.npz file if it exists
all_file = os.path.join(folder, 'all.npz')
if os.path.exists(all_file):
dataset = loadNpz(all_file)
else:
dataset = loadNpz(folder)
np.savez(all_file, **dataset)

if len(dataset['time']) < minimumFrames:
# remove the npz files, except for all.npz
files = os.listdir(folder)
for fn in files:
if fn.endswith('.npz') and not ('all.npz' == fn):
os.remove(os.path.join(folder, fn))
print('Removed', len(files), 'files')

if dropZeroDeltas: # drop frames with zero time deltas
deltas = np.diff(dataset['time'])
idx = np.where(0 == deltas)[0]
print('Dropping {} frames with zero time deltas'.format(len(idx)))
dataset = {k: np.delete(v, idx) for k, v in dataset.items()}

N = len(dataset['time'])
# print total deltas statistics
print('Dataset: {} frames'.format(N))
deltas = np.diff(dataset['time'])
print('Total time deltas: min={}, max={}, mean={}'.format(np.min(deltas), np.max(deltas), np.mean(deltas)))
deltas = None

if N < minimumFrames:
print('Dataset is too short. Skipping...')
return 0, 0
return 0, 0, True
# split dataset into sessions
sessions = Utils.extractSessions(dataset, float(timeDelta))
# print sessions and their durations for debugging
Expand All @@ -142,14 +170,11 @@ def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, ski
session_time = dataset['time'][idx]
delta = np.diff(session_time)
duration = session_time[-1] - session_time[0]
print('Session {}: {} - {} ({}, {})'.format(i, start, end, end - start, duration))
# print also min, max, and mean time deltas
print('Time deltas in session {}: min={}, max={}, mean={}'.format(i, np.min(delta), np.max(delta), np.mean(delta)))
print('Session {} - {}: min={}, max={}, mean={}, frames={}, duration={} sec'.format(
start, end, np.min(delta), np.max(delta), np.mean(delta), len(session_time), duration
))
continue
# print total deltas statistics
deltas = np.diff(dataset['time'])
print('Total time deltas: min={}, max={}, mean={}'.format(np.min(deltas), np.max(deltas), np.mean(deltas)))
deltas = None
######################################################
# split each session into training and testing sets
training, testing = splitDataset(
Expand All @@ -158,10 +183,13 @@ def processFolder(folder, timeDelta, testRatio, framesPerChunk, testPadding, ski
framesPerChunk=int(framesPerChunk),
skipAction=skippedFrames,
)

if 0 < testPadding:
testing = dropPadding(testing, testPadding)

if (0 == len(training)) or (0 == len(testing)):
print('No training or testing sets found!')
return 0, 0, True

def saveSubset(filename, idx):
print('%s: %d frames' % (filename, len(idx)))
subset = {k: v[idx] for k, v in dataset.items()}
Expand All @@ -170,18 +198,12 @@ def saveSubset(filename, idx):
assert np.all(diff >= 0), 'Time is not monotonically increasing!'
np.savez(os.path.join(folder, filename), **subset)
return

# remove the npz files
files = os.listdir(folder)
for fn in files:
os.remove(os.path.join(folder, fn))
print('Removed', len(files), 'files')
# save training and testing sets
saveSubset('train.npz', training)
saveSubset('test.npz', testing)

print('Processing ', folder, 'done')
return len(testing), len(training)
return len(testing), len(training), False

def main(args):
stats = {
Expand All @@ -208,16 +230,19 @@ def main(args):
if not (sid in stats['screenId']):
stats['screenId'].append(sid)
path = os.path.join(folder, placeId, userId, screenId)
testFramesN, trainFramesN = processFolder(
testFramesN, trainFramesN, isSkipped = processFolder(
path,
args.time_delta, args.test_ratio, args.frames_per_chunk,
args.test_padding, args.skipped_frames
args.test_padding, args.skipped_frames,
minimumFrames=args.minimum_frames,
dropZeroDeltas=args.drop_zero_deltas
)
testFrames += testFramesN
trainFrames += trainFramesN
# store the number of frames per chunk
sid = '%s/%s/%s' % (placeId, userId, screenId)
framesPerChunk[sid] = testFramesN + trainFramesN
if not isSkipped:
testFrames += testFramesN
trainFrames += trainFramesN
# store the number of frames per chunk
sid = '%s/%s/%s' % (placeId, userId, screenId)
framesPerChunk[sid] = testFramesN + trainFramesN
continue
print('Total: %d training frames, %d testing frames' % (trainFrames, testFrames))

Expand Down Expand Up @@ -246,6 +271,7 @@ def main(args):
help='What to do with skipped frames ("train", "test", or "drop")'
)
parser.add_argument('--minimum-frames', type=int, default=0, help='Minimum number of frames in a dataset')
parser.add_argument('--drop-zero-deltas', action='store_true', help='Drop frames with zero time deltas')
args = parser.parse_args()
main(args)
pass

0 comments on commit 159c4fb

Please sign in to comment.