Skip to content

Commit

Permalink
Generate TSV instead of TXT, per discussion on Gitter
Browse files (browse the repository at this point in the history)
  • Loading branch information
lepture committed Dec 17, 2015
1 parent f267fc9 commit 4f4c8eb
Show file tree
Hide file tree
Showing 36 changed files with 59,879 additions and 59,855 deletions.
3,510 changes: 0 additions & 3,510 deletions 2002.txt

This file was deleted.

3,511 changes: 3,511 additions & 0 deletions 200212.tsv

Large diffs are not rendered by default.

3,522 changes: 0 additions & 3,522 deletions 2003.txt

This file was deleted.

3,515 changes: 3,515 additions & 0 deletions 200306.tsv

Large diffs are not rendered by default.

3,514 changes: 0 additions & 3,514 deletions 200306.txt

This file was deleted.

3,523 changes: 3,523 additions & 0 deletions 200312.tsv

Large diffs are not rendered by default.

3,523 changes: 0 additions & 3,523 deletions 2004.txt

This file was deleted.

3,524 changes: 3,524 additions & 0 deletions 200403.tsv

Large diffs are not rendered by default.

3,523 changes: 0 additions & 3,523 deletions 200403.txt

This file was deleted.

3,524 changes: 3,524 additions & 0 deletions 200409.tsv

Large diffs are not rendered by default.

3,523 changes: 0 additions & 3,523 deletions 200409.txt

This file was deleted.

3,524 changes: 3,524 additions & 0 deletions 200412.tsv

Large diffs are not rendered by default.

3,523 changes: 0 additions & 3,523 deletions 2005.txt

This file was deleted.

3,524 changes: 3,524 additions & 0 deletions 200506.tsv

Large diffs are not rendered by default.

3,523 changes: 0 additions & 3,523 deletions 200506.txt

This file was deleted.

3,524 changes: 3,524 additions & 0 deletions 200512.tsv

Large diffs are not rendered by default.

3,524 changes: 0 additions & 3,524 deletions 2006.txt

This file was deleted.

3,525 changes: 3,525 additions & 0 deletions 200612.tsv

Large diffs are not rendered by default.

3,524 changes: 0 additions & 3,524 deletions 2007.txt

This file was deleted.

3,525 changes: 3,525 additions & 0 deletions 200712.tsv

Large diffs are not rendered by default.

3,525 changes: 0 additions & 3,525 deletions 2008.txt

This file was deleted.

3,526 changes: 3,526 additions & 0 deletions 200812.tsv

Large diffs are not rendered by default.

3,520 changes: 0 additions & 3,520 deletions 2009.txt

This file was deleted.

3,521 changes: 3,521 additions & 0 deletions 200912.tsv

Large diffs are not rendered by default.

3,518 changes: 0 additions & 3,518 deletions 2010.txt

This file was deleted.

3,519 changes: 3,519 additions & 0 deletions 201010.tsv

Large diffs are not rendered by default.

3,514 changes: 0 additions & 3,514 deletions 2011.txt

This file was deleted.

3,515 changes: 3,515 additions & 0 deletions 201110.tsv

Large diffs are not rendered by default.

3,510 changes: 0 additions & 3,510 deletions 2012.txt

This file was deleted.

3,511 changes: 3,511 additions & 0 deletions 201210.tsv

Large diffs are not rendered by default.

3,515 changes: 0 additions & 3,515 deletions 2013.txt

This file was deleted.

3,516 changes: 3,516 additions & 0 deletions 201308.tsv

Large diffs are not rendered by default.

3,512 changes: 0 additions & 3,512 deletions 2014.txt

This file was deleted.

3,513 changes: 3,513 additions & 0 deletions 201410.tsv

Large diffs are not rendered by default.

30 changes: 15 additions & 15 deletions revisions.json
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
{
"GB2260": [
"2007",
"2002"
"200712",
"200212"
],
"revisions": [
"2014",
"2013",
"2012",
"2011",
"2010",
"2009",
"2008",
"2007",
"2006",
"2005",
"201410",
"201308",
"201210",
"201110",
"201010",
"200912",
"200812",
"200712",
"200612",
"200512",
"200506",
"2004",
"200412",
"200409",
"200403",
"2003",
"200312",
"200306",
"2002"
"200212"
]
}
41 changes: 24 additions & 17 deletions scripts/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,24 @@

URL_BASE = 'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/'
URL_LIST = [
# (year, url, is_mass)
(2014, '201504/t20150415_712722.html', False),
(2013, '201401/t20140116_501070.html', False),
(2012, '201301/t20130118_38316.html', False),
(2011, '201201/t20120105_38315.html', False),
(2010, '201107/t20110726_38314.html', False),
(2009, '201006/t20100623_38313.html', False),
(2008, '200906/t20090626_38312.html', False),
(2007, '200802/t20080215_38311.html', False),
(2006, '200704/t20070411_38310.html', False),
(2005, '200410/t20041022_38307.html', True),
# (revision, url, is_mass)
(201410, '201504/t20150415_712722.html', False),
(201308, '201401/t20140116_501070.html', False),
(201210, '201301/t20130118_38316.html', False),
(201110, '201201/t20120105_38315.html', False),
(201010, '201107/t20110726_38314.html', False),
(200912, '201006/t20100623_38313.html', False),
(200812, '200906/t20090626_38312.html', False),
(200712, '200802/t20080215_38311.html', False),
(200612, '200704/t20070411_38310.html', False),
(200512, '200410/t20041022_38307.html', True),
(200506, '200410/t20041022_38306.html', True),
(2004, '200410/t20041022_38305.html', True),
(200412, '200410/t20041022_38305.html', True),
(200409, '200410/t20041022_38304.html', True),
(200403, '200406/t20040607_38302.html', True),
(2003, '200402/t20040211_38301.html', True),
(200312, '200402/t20040211_38301.html', True),
(200306, '200307/t20030722_38300.html', True),
(2002, '200302/t20030219_38299.html', True),
(200212, '200302/t20030219_38299.html', True),
]

XPATH_EXPRS = [
Expand All @@ -42,6 +42,8 @@
'.//p[@class="MsoNormal"]//span//text()',
]

SAC = [200212, 200712]


def strip_spaces_in_chinese_words(line):
cjk_chars = u'\u3007\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff'
Expand Down Expand Up @@ -93,15 +95,20 @@ def main():
print('Usage: %s [dir]' % sys.argv[0], file=sys.stderr)
sys.exit(0)

for suffix, url, is_mass in URL_LIST:
for revision, url, is_mass in URL_LIST:
req = requests.get(URL_BASE + url)
req.encoding = 'utf-8'
el = fromstring(req.text)

dirname = os.path.join(sys.argv[1], '%s.txt' % suffix)
dirname = os.path.join(sys.argv[1], '%s.tsv' % revision)
print('--> %s' % dirname, file=sys.stderr)

source = 'stats'
if revision in SAC:
source = 'sac'

with open(dirname, 'w') as dest_file:
print(b'Source\tRevision\tCode\tName', file=dest_file)
for line in iter_lines(el, is_mass):
text = strip_spaces_in_chinese_words(strip_comments(line))
if not text:
Expand All @@ -113,7 +120,7 @@ def main():
except ValueError:
print('ignored: %s' % text, file=sys.stderr)
else:
out = '%s\t%s' % (code, name)
out = '%s\t%s\t%s\t%s' % (source, revision, code, name)
print(out.encode('utf-8'), file=dest_file)


Expand Down

0 comments on commit 4f4c8eb

Please sign in to comment.