Skip to content

Commit

Permalink
Merge pull request nltk#58 from simonrichard/gh-pages
Browse files Browse the repository at this point in the history
Add Dolch word list
  • Loading branch information
stevenbird authored Apr 14, 2017
2 parents 1164703 + 1d6ca44 commit e93c1eb
Show file tree
Hide file tree
Showing 6 changed files with 10 additions and 0 deletions.
1 change: 1 addition & 0 deletions collections/all-corpora.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
<item ref="conll2007"/>
<item ref="crubadan"/>
<item ref="dependency_treebank"/>
<item ref="dolch"/>
<item ref="floresta"/>
<item ref="framenet_v15"/>
<item ref="framenet_v17"/>
Expand Down
1 change: 1 addition & 0 deletions collections/all.xml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
<item ref="conll2007"/>
<item ref="crubadan"/>
<item ref="dependency_treebank"/>
<item ref="dolch"/>
<item ref="europarl_raw"/>
<item ref="floresta"/>
<item ref="framenet_v15"/>
Expand Down
3 changes: 3 additions & 0 deletions index.xml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
<package checksum="b9015928e35c41f0695525289df5208f" contact="Kepa Sarasola" copyright="Copyright (C) 2007 The University of the Basque Country" id="conll2007" license="Creative Commons Attribution-NonCommercial-NoDerivativeWorks license" name="Dependency Treebanks from CoNLL 2007 (Catalan and Basque Subset)" size="1242958" subdir="corpora" unzip="0" unzipped_size="6399295" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/conll2007.zip" webpage="http://nextens.uvt.nl/depparse-wiki/DataDownload" />
<package author="Kevin Scannell" checksum="3cc831382dec41b8d9a06d93ef300352" copyright="Copyright (C) 2010 Kevin Scannell" id="crubadan" license="GPLv3" name="Crubadan Corpus" size="5288655" subdir="corpora" unzip="1" unzipped_size="11256183" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/crubadan.zip" webpage="http://borel.slu.edu/crubadan/" />
<package checksum="631e959acaa42eea718daf04c5cdfa76" copyright="Copyright (C) 1995 University of Pennsylvania" id="dependency_treebank" license="This is a 10% fragment of Penn Treebank, (C) LDC 1995, which has been dependency parsed. It is made available under fair use for the purposes of illustrating NLTK tools for tokenizing, tagging, chunking and parsing. This data is for non-commercial use only." name="Dependency Parsed Treebank" sample="True" size="457429" subdir="corpora" unzip="1" unzipped_size="1069540" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip" />
<package checksum="6f9c042774b96366c93fd0f9a9adb697" id="dolch" name="Dolch Word List" size="2116" subdir="corpora" unzip="1" unzipped_size="1917" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dolch.zip" webpage="https://en.wikipedia.org/wiki/Dolch_word_list" />
<package author="Philipp Koehn, University of Edinburgh" checksum="7621d5675990b1decc012c823716ee76" id="europarl_raw" name="Sample European Parliament Proceedings Parallel Corpus" size="12594977" subdir="corpora" unzip="1" unzipped_size="41396100" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/europarl_raw.zip" webpage="http://www.statmt.org/europarl" />
<package checksum="de5f1df09949f080e0f616f0bc55967d" id="floresta" license="Non-commercial use only" name="Portuguese Treebank" size="1882021" subdir="corpora" unzip="1" unzipped_size="16414136" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/floresta.zip" webpage="http://www.linguateca.pt/Floresta/" />
<package author="Collin F. Baker" checksum="cf68365950b2f048bcb48619de81f50a" id="framenet_v15" license="May be used for non-commercial purposes." name="FrameNet 1.5" size="69337891" subdir="corpora" unzip="1" unzipped_size="579133737" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/framenet_v15.zip" webpage="http://framenet.icsi.berkeley.edu" />
Expand Down Expand Up @@ -128,6 +129,7 @@
<item ref="conll2007" />
<item ref="crubadan" />
<item ref="dependency_treebank" />
<item ref="dolch" />
<item ref="floresta" />
<item ref="framenet_v15" />
<item ref="framenet_v17" />
Expand Down Expand Up @@ -204,6 +206,7 @@
<item ref="conll2007" />
<item ref="crubadan" />
<item ref="dependency_treebank" />
<item ref="dolch" />
<item ref="europarl_raw" />
<item ref="floresta" />
<item ref="framenet_v15" />
Expand Down
4 changes: 4 additions & 0 deletions packages/corpora/dolch.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<!-- Package descriptor for the Dolch Word List corpus (id "dolch"). -->
<package id="dolch"
         name="Dolch Word List"
         unzip="1"
         webpage="https://en.wikipedia.org/wiki/Dolch_word_list" />
Binary file added packages/corpora/dolch.zip
Binary file not shown.
1 change: 1 addition & 0 deletions packages/corpora/listing.csv
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ chat80,chat80,Chat-80 Database,,,chat80.py,University of Pennsylvania,,http://ww
cmudict,cmudict,Carnegie Mellon Pronouncing Dictionary,,,cmudict.py,,,,,
conll2000,conll2000,CoNLL 2000 Chunking Corpus,,,conll2000.py,,,,,
conll2002,conll2002,CoNLL 2002 NER Corpus,Dutch::Spanish,,conll2002.py,,,,,
dolch,dolch,Dolch Word List,,,,,,,,
genesis,genesis,Genesis Corpus,,,genesis.py,,,,,
gutenberg,gutenberg,Project Gutenberg Selections,,,gutenberg.py,,,,,
ieer,ieer,NIST 1999 Information Extraction ,Entity Recognition Corpus,,ieer.py,,,,,
Expand Down

0 comments on commit e93c1eb

Please sign in to comment.