Publish SCAN MCD splits.

PiperOrigin-RevId: 301568668
johko · Mar 18, 2020 · 7078608 · 7078608
1 parent 335d8b9
commit 7078608
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 2 deletions.
diff --git a/cfq/README.md b/cfq/README.md
@@ -5,7 +5,7 @@ the Compositional Freebase Questions (CFQ) dataset.
 
 The dataset can be downloaded from the following URL:
 
-[Download the CFQ dataset](https://storage.cloud.google.com/cfq_dataset/cfq.tar.gz)
+[Download the CFQ dataset](https://storage.cloud.google.com/cfq_dataset/cfq1.1.tar.gz)
 
 The dataset and details about its construction and use are described in this ICLR 2020 paper: [Measuring Compositional Generalization: A Comprehensive Method on Realistic Data](https://openreview.net/forum?id=SygcCnNKwr).
 
@@ -62,6 +62,12 @@ splits. Accuracies vary between 5% and 37% over splits and architectures:
 | MCD2  |  5.0 +/- 0.8 |  8.2 +/- 0.3 |  8.1 +/- 1.6 |
 | MCD3  | 10.8 +/- 0.6 | 10.6 +/- 1.1 | 11.3 +/- 0.3 |
 
+## SCAN MCD splits
+We also publish the SCAN MCD splits from our paper. To run on them, first
+download the dataset from the [original source](https://github.com/brendenlake/SCAN),
+then set `dataset_local_path` to point to the tasks.txt file, and adjust `split_path`
+to point to one of the mcd.json files from our [scan archive](https://storage.cloud.google.com/cfq_dataset/scan-splits.tar.gz).
+
 ## Tensorflow datasets
 
 Our dataset and splits are also part of [TensorFlow Datasets](https://www.tensorflow.org/datasets)

diff --git a/cfq/preprocess.py b/cfq/preprocess.py
@@ -37,6 +37,27 @@ def load_json(path):
   return data
 
 
+def load_scan(path):
+  """Read original scan task data and convert into CFQ-style json format."""
+  logging.info(f'Reading SCAN tasks from {path}.')
+  def parse(infile):
+    for line in infile.read().split('\n'):
+      if not line.startswith('IN: '):
+        continue
+      commands, actions = line[len('IN: '):].strip().split(' OUT: ', 1)
+      yield {'questionPatternModEntities': commands,
+             'sparqlPatternModEntities': actions}
+  return list(parse(gfile.GFile(path)))
+
+
+def load_dataset(path):
+  """Load dataset from .json or SCAN task format."""
+  if path[-5:] == '.json':
+    return load_json(path)
+  else:
+    return load_scan(path)
+
+
 def tokenize_punctuation(text):
   """Pad punctuation with spaces and collapse all whitespace runs.
 
   Args:
     text: String to tokenize.
 
   Returns:
     `text` with every character found in `string.punctuation` surrounded by
     spaces, and with consecutive whitespace reduced to single spaces (no
     leading or trailing whitespace).
   """
   spaced = ''.join(
       f' {ch} ' if ch in string.punctuation else ch for ch in text)
   return ' '.join(spaced.split())

diff --git a/cfq/preprocess_main.py b/cfq/preprocess_main.py
@@ -49,7 +49,7 @@ def main(argv):
     raise app.UsageError('Too many command-line arguments.')
 
   dataset = preprocessor.get_dataset(
-      preprocessor.load_json(FLAGS.dataset_path),
+      preprocessor.load_dataset(FLAGS.dataset_path),
       preprocessor.load_json(FLAGS.split_path))
   preprocessor.write_dataset(dataset, FLAGS.save_path)
   token_vocab = preprocessor.get_token_vocab(FLAGS.save_path)