Skip to content

Commit e93c1eb

Browse files
authored
Merge pull request nltk#58 from simonrichard/gh-pages
Add Dolch word list
2 parents 1164703 + 1d6ca44 commit e93c1eb

File tree

6 files changed

+10
-0
lines changed

6 files changed

+10
-0
lines changed

collections/all-corpora.xml

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
<item ref="conll2007"/>
1616
<item ref="crubadan"/>
1717
<item ref="dependency_treebank"/>
18+
<item ref="dolch"/>
1819
<item ref="floresta"/>
1920
<item ref="framenet_v15"/>
2021
<item ref="framenet_v17"/>

collections/all.xml

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
<item ref="conll2007"/>
1717
<item ref="crubadan"/>
1818
<item ref="dependency_treebank"/>
19+
<item ref="dolch"/>
1920
<item ref="europarl_raw"/>
2021
<item ref="floresta"/>
2122
<item ref="framenet_v15"/>

index.xml

+3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
<package checksum="b9015928e35c41f0695525289df5208f" contact="Kepa Sarasola" copyright="Copyright (C) 2007 The University of the Basque Country" id="conll2007" license="Creative Commons Attribution-NonCommercial-NoDerivativeWorks license" name="Dependency Treebanks from CoNLL 2007 (Catalan and Basque Subset)" size="1242958" subdir="corpora" unzip="0" unzipped_size="6399295" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/conll2007.zip" webpage="http://nextens.uvt.nl/depparse-wiki/DataDownload" />
2121
<package author="Kevin Scannell" checksum="3cc831382dec41b8d9a06d93ef300352" copyright="Copyright (C) 2010 Kevin Scannell" id="crubadan" license="GPLv3" name="Crubadan Corpus" size="5288655" subdir="corpora" unzip="1" unzipped_size="11256183" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/crubadan.zip" webpage="http://borel.slu.edu/crubadan/" />
2222
<package checksum="631e959acaa42eea718daf04c5cdfa76" copyright="Copyright (C) 1995 University of Pennsylvania" id="dependency_treebank" license="This is a 10% fragment of Penn Treebank, (C) LDC 1995, which has been dependency parsed. It is made available under fair use for the purposes of illustrating NLTK tools for tokenizing, tagging, chunking and parsing. This data is for non-commercial use only." name="Dependency Parsed Treebank" sample="True" size="457429" subdir="corpora" unzip="1" unzipped_size="1069540" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip" />
23+
<!-- Dolch Word List: "url" is the downloadable zip described by checksum/size; "webpage" is the reference page. -->
<package checksum="6f9c042774b96366c93fd0f9a9adb697" id="dolch" name="Dolch Word List" size="2116" subdir="corpora" unzip="1" unzipped_size="1917" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dolch.zip" webpage="https://en.wikipedia.org/wiki/Dolch_word_list" />
2324
<package author="Philipp Koehn, University of Edinburgh" checksum="7621d5675990b1decc012c823716ee76" id="europarl_raw" name="Sample European Parliament Proceedings Parallel Corpus" size="12594977" subdir="corpora" unzip="1" unzipped_size="41396100" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/europarl_raw.zip" webpage="http://www.statmt.org/europarl" />
2425
<package checksum="de5f1df09949f080e0f616f0bc55967d" id="floresta" license="Non-commercial use only" name="Portuguese Treebank" size="1882021" subdir="corpora" unzip="1" unzipped_size="16414136" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/floresta.zip" webpage="http://www.linguateca.pt/Floresta/" />
2526
<package author="Collin F. Baker" checksum="cf68365950b2f048bcb48619de81f50a" id="framenet_v15" license="May be used for non-commercial purposes." name="FrameNet 1.5" size="69337891" subdir="corpora" unzip="1" unzipped_size="579133737" url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/framenet_v15.zip" webpage="http://framenet.icsi.berkeley.edu" />
@@ -128,6 +129,7 @@
128129
<item ref="conll2007" />
129130
<item ref="crubadan" />
130131
<item ref="dependency_treebank" />
132+
<item ref="dolch" />
131133
<item ref="floresta" />
132134
<item ref="framenet_v15" />
133135
<item ref="framenet_v17" />
@@ -204,6 +206,7 @@
204206
<item ref="conll2007" />
205207
<item ref="crubadan" />
206208
<item ref="dependency_treebank" />
209+
<item ref="dolch" />
207210
<item ref="europarl_raw" />
208211
<item ref="floresta" />
209212
<item ref="framenet_v15" />

packages/corpora/dolch.xml

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<package id="dolch" name="Dolch Word List"
2+
webpage="https://en.wikipedia.org/wiki/Dolch_word_list"
3+
unzip="1"
4+
/>

packages/corpora/dolch.zip

2.07 KB
Binary file not shown.

packages/corpora/listing.csv

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ chat80,chat80,Chat-80 Database,,,chat80.py,University of Pennsylvania,,http://ww
77
cmudict,cmudict,Carnegie Mellon Pronouncing Dictionary,,,cmudict.py,,,,,
88
conll2000,conll2000,CoNLL 2000 Chunking Corpus,,,conll2000.py,,,,,
99
conll2002,conll2002,CoNLL 2002 NER Corpus,Dutch::Spanish,,conll2002.py,,,,,
10+
dolch,dolch,Dolch Word List,,,,,,,,
1011
genesis,genesis,Genesis Corpus,,,genesis.py,,,,,
1112
gutenberg,gutenberg,Project Gutenberg Selections,,,gutenberg.py,,,,,
1213
ieer,ieer,NIST 1999 Information Extraction ,Entity Recognition Corpus,,ieer.py,,,,,

0 commit comments

Comments
 (0)