refactor: fix the wikipedia scraping sample file
account0248 committed May 27, 2022
1 parent b03e8e9 commit 503cf4a
Showing 2 changed files with 164 additions and 9 deletions.
9 changes: 8 additions & 1 deletion .gitignore
@@ -1,3 +1,10 @@
# Config files and metadata
.ipynb_checkpoints
.venv
requirements_[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]_[0-9][0-9][0-9][0-9][0-9][0-9].txt
.vscode

# Backup files output by running the setup script
requirements_[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]_[0-9][0-9][0-9][0-9][0-9][0-9].txt

# CSV files output by running the sample code
sample/*.csv
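
The requirements_[0-9]... pattern above matches date-stamped backup names of the form requirements_YYYYMMDD_HHMMSS.txt. A minimal sketch of how such a name might be produced, assuming the setup script stamps the current time (the script itself is not part of this diff):

import datetime

# Illustration only: builds a name in the requirements_YYYYMMDD_HHMMSS.txt
# form that the ignore pattern above matches.
stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
print(f"requirements_{stamp}.txt")  # e.g. requirements_20220527_093015.txt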
164 changes: 156 additions & 8 deletions sample/today_wikipedia.ipynb
@@ -7,19 +7,25 @@
"metadata": {},
"outputs": [],
"source": [
"# example[1]:データ取得の流れ\n",
"\n",
"# wikipediaの「今日はなんの日」の情報をスクレイピングする\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import bs4\n",
"\n",
"# スクレイピング対象のURL\n",
"url = \"https://ja.wikipedia.org\"\n",
"\n",
"# HTTPリクエストを行い、HTTPレスポンスを取得\n",
"response = requests.get(url)\n",
"#print(response)\n",
"\n",
"soup = BeautifulSoup(response.content, \"html.parser\")\n",
"#print(soup)\n",
"# Beautiful Soupでタグ単位に解析\n",
"soup = bs4.BeautifulSoup(response.content, \"html.parser\")\n",
"\n",
"# devタグのidがon_this_dayである要素(今日は何の日)のtextを取得\n",
"today = soup.find(\"div\", attrs={\"id\": \"on_this_day\"})\n",
"\n",
"today = soup.find(\"div\", attrs={\"id\": \"on_this_day\"}).text\n",
"print(today)"
"print(today.text)"
]
},
{
@@ -28,15 +34,157 @@
"id": "840177ff",
"metadata": {},
"outputs": [],
"source": []
"source": [
"# example[2]:スクレイピング結果への採番\n",
"\n",
"# wikipediaの「今日はなんの日」の情報をスクレイピングする\n",
"# スクレイピング結果に連番を採番する\n",
"import requests\n",
"import bs4\n",
"\n",
"# スクレイピング対象のURL\n",
"url = \"https://ja.wikipedia.org\"\n",
"\n",
"# HTTPリクエストを行い、HTTPレスポンスを取得\n",
"response = requests.get(url)\n",
"\n",
"# Beautiful Soupでタグ単位に解析\n",
"soup = bs4.BeautifulSoup(response.content, \"html.parser\")\n",
"\n",
"# devタグのidがon_this_dayである要素を取得(今日は何の日)\n",
"today = soup.find(\"div\", attrs={\"id\": \"on_this_day\"})\n",
"\n",
"# todayオブジェクト(今日は何の日)からliタグの要素を1件1リストとして取得\n",
"entries = today.find_all(\"li\")\n",
"\n",
"# 1件1リストとして取得した「今日は何の日」をenumerate()を使って、番号付きで取り出す\n",
"for i, entry in enumerate(entries):\n",
" # 文字列のフォーマットを指定して出力(書式化演算子)\n",
" # print(\"%d: %s\" % (i + 1, entry.get_text()))\n",
"\n",
" # 文字列のフォーマットを指定して出力(python3.x以降の構文)\n",
" # print(\"{}: {}\".format(i + 1, entry.get_text()))\n",
" \n",
" # 文字列のフォーマットを指定して出力(python3.6以降の構文)\n",
" print(f\"{i + 1}: {entry.get_text()}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f802be39",
"metadata": {},
"outputs": [],
"source": [
"# example[3]:スクレイピング結果の出力\n",
"\n",
"# wikipediaの「今日はなんの日」の情報をスクレイピングする\n",
"# スクレイピング結果に連番を採番する\n",
"# スクレイピング結果をCSVに保存する\n",
"import requests\n",
"import bs4\n",
"import csv\n",
"\n",
"# スクレイピング対象のURL\n",
"url = \"https://ja.wikipedia.org\"\n",
"\n",
"# HTTPリクエストを行い、HTTPレスポンスを取得\n",
"response = requests.get(url)\n",
"\n",
"# Beautiful Soupでタグ単位に解析\n",
"soup = bs4.BeautifulSoup(response.content, \"html.parser\")\n",
"\n",
"# devタグのidがon_this_dayである要素を取得(今日は何の日)\n",
"today = soup.find(\"div\", attrs={\"id\": \"on_this_day\"})\n",
"\n",
"# csv出力用のリスト\n",
"today_list = []\n",
"\n",
"# todayオブジェクト(今日は何の日)からliタグの要素を1件1リストとして取得\n",
"entries = today.find_all(\"li\")\n",
"\n",
"# 1件1リストとして取得した「今日は何の日」をenumerate()を使って、番号付きで取り出す\n",
"for i, entry in enumerate(entries):\n",
" # 文字列のフォーマットを指定して出力(python3.6以降の構文)\n",
" today_list.append([\n",
" i + 1, \n",
" entry.get_text()\n",
" ])\n",
"\n",
"# csvファイルへの出力\n",
"with open(\"wiki_exsample3.csv\", \"w\", encoding=\"Shift_JIS\") as file:\n",
" writer = csv.writer(file, lineterminator=\"\\n\")\n",
" writer.writerows(today_list)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "3f210187",
"metadata": {},
"outputs": [],
"source": [
"# example[4]:年号の取得\n",
"\n",
"# wikipediaの「今日はなんの日」の情報をスクレイピングする\n",
"# スクレイピング結果に連番を採番する\n",
"# スクレイピング結果から、年号を取り出す\n",
"# スクレイピング結果をCSVに保存する\n",
"import requests\n",
"import bs4\n",
"import csv\n",
"import re\n",
"\n",
"# スクレイピング対象のURL\n",
"url = \"https://ja.wikipedia.org\"\n",
"\n",
"# HTTPリクエストを行い、HTTPレスポンスを取得\n",
"response = requests.get(url)\n",
"\n",
"# Beautiful Soupでタグ単位に解析\n",
"soup = bs4.BeautifulSoup(response.content, \"html.parser\")\n",
"\n",
"# devタグのidがon_this_dayである要素を取得(今日は何の日)\n",
"top_entry = soup.find(\"div\", attrs={\"id\": \"on_this_day\"})\n",
"\n",
"# todayオブジェクト(今日は何の日)からliタグの要素を1件1リストとして取得\n",
"entries = top_entry.find_all(\"li\")\n",
"\n",
"# csv出力用のリスト\n",
"today_list = []\n",
"\n",
"# 1件1リストとして取得した「今日は何の日」をenumerate()を使って、番号付きで取り出す\n",
"for i, entry in enumerate(entries):\n",
" # 括弧の書式を全て半角で合わせる\n",
" today_text = entry.get_text().replace(\"\", \"(\").replace(\"\", \")\")\n",
"\n",
" # reライブラリのsearch()で、(****年)で囲まれるテキストを取得する\n",
" match = re.search(\"\\\\(([1-9].*?)年\\\\)\", today_text)\n",
" if match:\n",
" today_list.append([\n",
" i + 1, \n",
" entry.get_text(), \n",
" match.group(1)\n",
" ])\n",
" else:\n",
" today_list.append([\n",
" i + 1, \n",
" entry.get_text()\n",
" ])\n",
"\n",
"# csvファイルへの出力\n",
"with open(\"wiki_exsample4.csv\", \"w\", encoding=\"Shift_JIS\") as file:\n",
" writer = csv.writer(file, lineterminator=\"\\n\")\n",
" writer.writerows(today_list)"
]
}
],
"metadata": {
"interpreter": {
"hash": "bf8ffbffa42428e6ca2d839fe22706e99c2b4f72a7a8a3675ca21d7d875b7aeb"
},
"kernelspec": {
"display_name": "Python 3.10.1 ('.venv': venv)",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
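
A note on the year extraction in example[4]: the pattern \(([1-9].*?)年\) uses half-width parentheses, which is why the full-width parentheses from the page are normalized first. A minimal standalone sketch, run on made-up strings rather than live scraped data:

import re

# Same pattern as example[4]: '(', a year starting with 1-9, '年', ')'
pattern = re.compile(r"\(([1-9].*?)年\)")

samples = [
    "アメリカ独立宣言(1776年)",  # full-width parentheses, as on the page
    "記念日(年は不明)",          # no leading digit after '(': no match
]

for text in samples:
    normalized = text.replace("(", "(").replace(")", ")")
    match = pattern.search(normalized)
    print(match.group(1) if match else "no year")  # prints 1776, then no year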

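One caveat on the CSV writes in examples [3] and [4]: scraped Wikipedia text can contain characters that Shift_JIS cannot encode, in which case open(..., encoding="Shift_JIS") raises UnicodeEncodeError on write. A defensive variant, sketched under the assumption that lossy replacement is acceptable (the file name here is hypothetical, and this is not what the commit does):

import csv

rows = [[1, "stand-in row, not actual scraped data"]]

# errors="replace" swaps unencodable characters for '?' instead of raising;
# newline="" is the csv module's recommended way to open the output file.
with open("wiki_sample_safe.csv", "w", encoding="Shift_JIS", errors="replace", newline="") as file:
    writer = csv.writer(file)
    writer.writerows(rows)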