Skip to content

Commit

Permalink
fix(cmd/fetch): 修正东莞市这种特殊的地区数据拉取
Browse files: browse the repository at this point in the history
  • Loading branch information
caixw committed Feb 8, 2023
1 parent f8af91e commit d4e6363
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 14 deletions.
3 changes: 1 addition & 2 deletions cmd/fetch/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,10 @@ func buildCollector(base string) (*colly.Collector, error) {
),
colly.UserAgent(userAgent),
colly.DetectCharset(),
colly.Async(true),
colly.AllowURLRevisit(),
)

rule := &colly.LimitRule{Parallelism: 100, DomainGlob: "*", RandomDelay: time.Second}
rule := &colly.LimitRule{Parallelism: 100, DomainGlob: "*", RandomDelay: 800 * time.Millisecond}
if err := c.Limit(rule); err != nil {
return nil, err
}
Expand Down
45 changes: 34 additions & 11 deletions cmd/fetch/fetch.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package main

import (
"errors"
"fmt"
"os"
"path/filepath"
Expand All @@ -20,6 +21,8 @@ const baseURL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/"

var digit = regexp.MustCompile("[0-9]+")

var errNoData = errors.New("no data")

// 拉取指定年份的数据
//
// years 为指定的一个或多个年份,如果为空,则表示所有的年份。
Expand Down Expand Up @@ -88,7 +91,7 @@ func fetchYear(dir string, interval time.Duration, year int) error {
}
fmt.Println(colorsSprintf(colors.Green, "拉取 %d 年份的省级数据完成,总共 %d 条\n", year, len(provinces)))

f, err := os.Create(dir + "/../error.log")
f, err := os.Create(dir + "/../" + y + "-error.log")
if err != nil {
return err
}
Expand Down Expand Up @@ -126,8 +129,8 @@ func fetchProvince(dir, base string, p *item) error {

cities := make([]*item, 0, 500)
c.OnHTML(".citytable .citytr td a", func(e *colly.HTMLElement) {
href := strings.TrimSuffix(e.Attr("href"), ".html")
cities = append(cities, &item{id: href, text: e.Text})
id := strings.TrimSuffix(e.Attr("href"), ".html")
cities = append(cities, &item{id: id, text: e.Text})
})

if err := c.Visit(base + p.id + ".html"); err != nil {
Expand All @@ -145,7 +148,16 @@ func fetchProvince(dir, base string, p *item) error {
continue
}

if err := fetchCity(fs, base, city); err != nil {
err = fetchCity(fs, base, city)
switch {
case errors.Is(err, errNoData):
if err1 := fetchCounty(fs, base, city); err1 != nil { // 广东省 东莞
if errors.Is(err1, errNoData) {
err1 = fmt.Errorf("未获取到 %s:%s 的县/乡镇数据", city.id, city.text)
}
return err1
}
case err != nil:
return err
}
}
Expand All @@ -163,8 +175,8 @@ func fetchCity(fs *provinceFile, base string, p *item) error {

counties := make([]*item, 0, 500)
c.OnHTML(".countytable .countytr td a", func(e *colly.HTMLElement) {
href := strings.TrimSuffix(e.Attr("href"), ".html")
counties = append(counties, &item{id: href, text: e.Text})
id := strings.TrimSuffix(e.Attr("href"), ".html")
counties = append(counties, &item{id: id, text: e.Text})
})

if err := c.Visit(base + p.id + ".html"); err != nil {
Expand All @@ -173,7 +185,7 @@ func fetchCity(fs *provinceFile, base string, p *item) error {
c.Wait()

if len(counties) == 0 {
return fmt.Errorf("未获取到 %s:%s 的县级数据", p.id, p.text)
return errNoData
}
fmt.Println(colorsSprintf(colors.Green, "拉取 %s 的县级数据完成,总共 %d 条\n", p.text, len(counties)))

Expand All @@ -183,6 +195,9 @@ func fetchCity(fs *provinceFile, base string, p *item) error {
}

if err := fetchCounty(fs, base+firstID(p.id)+"/", county); err != nil {
if errors.Is(err, errNoData) {
err = fmt.Errorf("未获取到 %s:%s 的乡镇数据", county.id, county.text)
}
return err
}
}
Expand All @@ -199,8 +214,13 @@ func fetchCounty(fs *provinceFile, base string, p *item) error {

towns := make([]*item, 0, 500)
c.OnHTML(".towntable .towntr td a", func(e *colly.HTMLElement) {
href := strings.TrimSuffix(e.Attr("href"), ".html")
towns = append(towns, &item{id: href, text: e.Text})
id := strings.TrimSuffix(e.Attr("href"), ".html")
towns = append(towns, &item{id: id, text: e.Text})
})

c.OnHTML(".countytable .towntr td a", func(e *colly.HTMLElement) { // 2021 之后的东莞等
id := strings.TrimSuffix(e.Attr("href"), ".html")
towns = append(towns, &item{id: id, text: e.Text})
})

if err := c.Visit(base + p.id + ".html"); err != nil {
Expand All @@ -209,7 +229,7 @@ func fetchCounty(fs *provinceFile, base string, p *item) error {
c.Wait()

if len(towns) == 0 {
return fmt.Errorf("未获取到 %s:%s 的乡镇数据", p.id, p.text)
return errNoData
}
fmt.Println(colorsSprintf(colors.Green, "拉取 %s 的乡镇数据完成,总共 %d 条\n", p.text, len(towns)))

Expand All @@ -219,6 +239,9 @@ func fetchCounty(fs *provinceFile, base string, p *item) error {
}

if err := fetchTown(fs, base+firstID(p.id)+"/", town); err != nil {
if errors.Is(err, errNoData) {
err = fmt.Errorf("未获取到 %s:%s 的街道数据", town.id, town.text)
}
return err
}
}
Expand Down Expand Up @@ -253,7 +276,7 @@ func fetchTown(fs *provinceFile, base string, p *item) error {
c.Wait()

if count == 0 {
return fmt.Errorf("未获取到 %s:%s 的街道数据", p.id, p.text)
return errNoData
}
fmt.Print(colorsSprintf(colors.Green, "拉取 %s 的街道数据完成,总共 %d 条\n", p.text, count))
return nil
Expand Down
2 changes: 1 addition & 1 deletion cmd/fetch/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func main() {
fetchFS := opt.New("fetch", "拉取数据\n", doFetch)
fetchFS.StringVar(&fetchDataDir, "data", "./data", "指定数据的保存目录")
fetchFS.StringVar(&fetchYears, "years", "", "指定年份,空值表示所有年份。格式 y1,y2。")
fetchFS.StringVar(&fetchInterval, "internal", "5m", "每拉取一个省份数据后的间隔时间。")
fetchFS.StringVar(&fetchInterval, "internal", "1m", "每拉取一个省份数据后的间隔时间。")

buildFS := opt.New("build", "生成数据\n", doBuild)
buildFS.StringVar(&buildDataDir, "data", "", "指定数据目录")
Expand Down

0 comments on commit d4e6363

Please sign in to comment.