From d4e63637b594049ceb15eec738e0ae798bcf56c2 Mon Sep 17 00:00:00 2001 From: caixw Date: Wed, 8 Feb 2023 10:07:23 +0800 Subject: [PATCH] =?UTF-8?q?fix(cmd/fetch):=20=E4=BF=AE=E6=AD=A3=E4=B8=9C?= =?UTF-8?q?=E8=8E=9E=E5=B8=82=E8=BF=99=E7=A7=8D=E7=89=B9=E6=AE=8A=E7=9A=84?= =?UTF-8?q?=E5=9C=B0=E5=8C=BA=E6=95=B0=E6=8D=AE=E6=8B=89=E5=8F=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmd/fetch/collector.go | 3 +-- cmd/fetch/fetch.go | 45 +++++++++++++++++++++++++++++++----------- cmd/fetch/main.go | 2 +- 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/cmd/fetch/collector.go b/cmd/fetch/collector.go index 3965002..0594958 100644 --- a/cmd/fetch/collector.go +++ b/cmd/fetch/collector.go @@ -85,11 +85,10 @@ func buildCollector(base string) (*colly.Collector, error) { ), colly.UserAgent(userAgent), colly.DetectCharset(), - colly.Async(true), colly.AllowURLRevisit(), ) - rule := &colly.LimitRule{Parallelism: 100, DomainGlob: "*", RandomDelay: time.Second} + rule := &colly.LimitRule{Parallelism: 100, DomainGlob: "*", RandomDelay: 800 * time.Millisecond} if err := c.Limit(rule); err != nil { return nil, err } diff --git a/cmd/fetch/fetch.go b/cmd/fetch/fetch.go index 22beea1..c98f7d6 100644 --- a/cmd/fetch/fetch.go +++ b/cmd/fetch/fetch.go @@ -3,6 +3,7 @@ package main import ( + "errors" "fmt" "os" "path/filepath" @@ -20,6 +21,8 @@ const baseURL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/" var digit = regexp.MustCompile("[0-9]+") +var errNoData = errors.New("no data") + // 拉取指定年份的数据 // // years 为指定的一个或多个年份,如果为空,则表示所有的年份。 @@ -88,7 +91,7 @@ func fetchYear(dir string, interval time.Duration, year int) error { } fmt.Println(colorsSprintf(colors.Green, "拉取 %d 年份的省级数据完成,总共 %d 条\n", year, len(provinces))) - f, err := os.Create(dir + "/../error.log") + f, err := os.Create(dir + "/../" + y + "-error.log") if err != nil { return err } @@ -126,8 +129,8 @@ func fetchProvince(dir, base string, p *item) error { cities := make([]*item, 0, 500) c.OnHTML(".citytable .citytr td a", func(e *colly.HTMLElement) { - href := strings.TrimSuffix(e.Attr("href"), ".html") - cities = append(cities, &item{id: href, text: e.Text}) + id := strings.TrimSuffix(e.Attr("href"), ".html") + cities = append(cities, &item{id: id, text: e.Text}) }) if err := c.Visit(base + p.id + ".html"); err != nil { @@ -145,7 +148,16 @@ func fetchProvince(dir, base string, p *item) error { continue } - if err := fetchCity(fs, base, city); err != nil { + err = fetchCity(fs, base, city) + switch { + case errors.Is(err, errNoData): + if err1 := fetchCounty(fs, base, city); err1 != nil { // 广东省 东莞 + if errors.Is(err1, errNoData) { + err1 = fmt.Errorf("未获取到 %s:%s 的县/乡镇数据", city.id, city.text) + } + return err1 + } + case err != nil: return err } } @@ -163,8 +175,8 @@ func fetchCity(fs *provinceFile, base string, p *item) error { counties := make([]*item, 0, 500) c.OnHTML(".countytable .countytr td a", func(e *colly.HTMLElement) { - href := strings.TrimSuffix(e.Attr("href"), ".html") - counties = append(counties, &item{id: href, text: e.Text}) + id := strings.TrimSuffix(e.Attr("href"), ".html") + counties = append(counties, &item{id: id, text: e.Text}) }) if err := c.Visit(base + p.id + ".html"); err != nil { @@ -173,7 +185,7 @@ func fetchCity(fs *provinceFile, base string, p *item) error { c.Wait() if len(counties) == 0 { - return fmt.Errorf("未获取到 %s:%s 的县级数据", p.id, p.text) + return errNoData } fmt.Println(colorsSprintf(colors.Green, "拉取 %s 的县级数据完成,总共 %d 条\n", p.text, len(counties))) @@ -183,6 +195,9 @@ func fetchCity(fs *provinceFile, base string, p *item) error { } if err := fetchCounty(fs, base+firstID(p.id)+"/", county); err != nil { + if errors.Is(err, errNoData) { + err = fmt.Errorf("未获取到 %s:%s 的乡镇数据", county.id, county.text) + } return err } } @@ -199,8 +214,13 @@ func fetchCounty(fs *provinceFile, base string, p *item) error { towns := make([]*item, 0, 500) c.OnHTML(".towntable .towntr td a", func(e *colly.HTMLElement) { - href := strings.TrimSuffix(e.Attr("href"), ".html") - towns = append(towns, &item{id: href, text: e.Text}) + id := strings.TrimSuffix(e.Attr("href"), ".html") + towns = append(towns, &item{id: id, text: e.Text}) + }) + + c.OnHTML(".countytable .towntr td a", func(e *colly.HTMLElement) { // 2021 之后的东莞等 + id := strings.TrimSuffix(e.Attr("href"), ".html") + towns = append(towns, &item{id: id, text: e.Text}) }) if err := c.Visit(base + p.id + ".html"); err != nil { @@ -209,7 +229,7 @@ func fetchCounty(fs *provinceFile, base string, p *item) error { c.Wait() if len(towns) == 0 { - return fmt.Errorf("未获取到 %s:%s 的乡镇数据", p.id, p.text) + return errNoData } fmt.Println(colorsSprintf(colors.Green, "拉取 %s 的乡镇数据完成,总共 %d 条\n", p.text, len(towns))) @@ -219,6 +239,9 @@ func fetchCounty(fs *provinceFile, base string, p *item) error { } if err := fetchTown(fs, base+firstID(p.id)+"/", town); err != nil { + if errors.Is(err, errNoData) { + err = fmt.Errorf("未获取到 %s:%s 的街道数据", town.id, town.text) + } return err } } @@ -253,7 +276,7 @@ func fetchTown(fs *provinceFile, base string, p *item) error { c.Wait() if count == 0 { - return fmt.Errorf("未获取到 %s:%s 的街道数据", p.id, p.text) + return errNoData } fmt.Print(colorsSprintf(colors.Green, "拉取 %s 的街道数据完成,总共 %d 条\n", p.text, count)) return nil diff --git a/cmd/fetch/main.go b/cmd/fetch/main.go index caa7501..492c1f7 100644 --- a/cmd/fetch/main.go +++ b/cmd/fetch/main.go @@ -39,7 +39,7 @@ func main() { fetchFS := opt.New("fetch", "拉取数据\n", doFetch) fetchFS.StringVar(&fetchDataDir, "data", "./data", "指定数据的保存目录") fetchFS.StringVar(&fetchYears, "years", "", "指定年份,空值表示所有年份。格式 y1,y2。") - fetchFS.StringVar(&fetchInterval, "internal", "5m", "每拉取一个省份数据后的间隔时间。") + fetchFS.StringVar(&fetchInterval, "internal", "1m", "每拉取一个省份数据后的间隔时间。") buildFS := opt.New("build", "生成数据\n", doBuild) buildFS.StringVar(&buildDataDir, "data", "", "指定数据目录")