-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
19,368 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
# Crawl https://vnexpress.net/ with `newspaper`, keep the articles whose
# 'tt_list_folder_name' meta tag lists the category 'Giới sao', and store them
# as a '|'-delimited CSV with the columns Date, Content, Source.

# %% [markdown]
# # 1. Load and install the libraries needed by the crawler

# %%
import csv

import numpy as np
import pandas as pd

import newspaper
from newspaper import Article as Atc
from newspaper import Source as Scr

# %% [markdown]
# # 2. Fetch every link and keep only the links that point to html pages:

# %%
# NOTE(review): memoize_articles=False presumably disables newspaper's cache of
# already-seen articles so that every run lists the full site - confirm against
# the newspaper documentation.
vnex_paper = newspaper.build('https://vnexpress.net/', language = 'vi', memoize_articles = False, follow_meta_refresh = True, keep_article_html=True)

# %%
size = vnex_paper.size()
print(f"Co tat ca {size} den tu https://vnexpress.net/")

# %%
# Ask for the URL list once instead of rebuilding it on every loop iteration,
# and use str.endswith instead of growing a suffix one character at a time.
# The suffixes deliberately have no leading dot, matching the original check.
all_urls = vnex_paper.article_urls()
valid_url = np.array(
    [url for url in all_urls if url.endswith(('htm', 'html'))],
    dtype=str,
)
print(f"Xuat hien {np.shape(valid_url)[0]} bai bao trong vnexpress")

# %%
HEADER = ['Date', 'Content', 'Source']
TARGET_CATEGORY = 'Giới sao'

# Collect rows in a plain list: np.append copies the whole array on each call,
# and a bytes-typed header array ends up written as b'Date' when nothing is
# appended to it.
rows = [HEADER]
for url in valid_url:
    article = Atc(str(url), language = 'vi')
    try:
        article.download()
        article.parse()
    except Exception as exc:
        # One unreachable or malformed page must not abort the whole crawl.
        print(f"Skipped {url}: {exc}")
        continue

    # The category list is a comma-separated string; pages without that meta
    # tag (or with a non-string value) are skipped, as before.
    categories = article.meta_data.get('tt_list_folder_name')
    if not isinstance(categories, str):
        continue
    if TARGET_CATEGORY in categories.split(','):
        # NOTE(review): an empty string is stored when 'lastmod' is missing -
        # confirm that is acceptable downstream.
        rows.append([article.meta_data.get('lastmod', ''), article.text, article.url])

# %% [markdown]
# # 3. Save the data in the format Date,Content,Source:

# %%
# Object dtype keeps the (n_rows, 3) layout without padding every cell to the
# length of the longest article. Row 0 is the header.
star = np.array(rows, dtype=object)
filename = 'vnex_vi_celeb.csv'

# %%
# utf-8 so Vietnamese text is written identically on every platform;
# newline='' so the csv module controls line endings (article text itself
# contains \r\n sequences).
with open(filename, 'w', encoding='utf-8', newline='') as csvFile:
    csvWriter = csv.writer(csvFile, delimiter='|', quoting= csv.QUOTE_ALL)
    csvWriter.writerows(star)

# %%
df = pd.read_csv(filename,delimiter='|')
print(df)
# Crawl https://vnexpress.net/ with `newspaper` and append every html article
# found to a '|'-delimited CSV with the columns Date, Content, Source.

# %% [markdown]
# # 1. Load and install the libraries needed by the crawler

# %%
import csv
import os

import numpy as np
import pandas as pd

import newspaper
from newspaper import Article as Atc
from newspaper import Source as Scr

# %% [markdown]
# # 2. Fetch every link and keep only the links that point to html pages:

# %%
# NOTE(review): without memoize_articles=False, newspaper presumably skips the
# articles it has already seen in earlier runs, which would explain a size of 0
# on a repeat run. Together with the append-mode write below this looks like an
# intentional incremental crawl - confirm.
vnex_paper = newspaper.build('https://vnexpress.net/', language = 'vi', follow_meta_refresh = True, keep_article_html=True)

# %%
size = vnex_paper.size()
print(f"Co tat ca {size} den tu https://vnexpress.net/")

# %%
# Ask for the URL list once instead of rebuilding it on every loop iteration,
# and use str.endswith instead of growing a suffix one character at a time.
# The suffixes deliberately have no leading dot, matching the original check.
all_urls = vnex_paper.article_urls()
valid_url = np.array(
    [url for url in all_urls if url.endswith(('htm', 'html'))],
    dtype=str,
)
print(f"Xuat hien {np.shape(valid_url)[0]} bai bao trong vnexpress")

# %%
HEADER = ['Date', 'Content', 'Source']

# Collect rows in a plain list: np.append copies the whole array on each call,
# and a bytes-typed header array is written out as b'Date' when no article row
# is appended to it.
rows = []
for url in valid_url:
    article = Atc(str(url), language = 'vi')
    try:
        article.download()
        article.parse()
    except Exception as exc:
        # One unreachable or malformed page must not abort the whole crawl.
        print(f"Skipped {url}: {exc}")
        continue
    # NOTE(review): an empty string is stored when 'lastmod' is missing -
    # confirm that is acceptable downstream.
    rows.append([article.meta_data.get('lastmod', ''), article.text, article.url])

# %% [markdown]
# # 3. Save the data in the format Date,Content,Source:

# %%
# Object dtype keeps the (n_rows, 3) layout without padding every cell to the
# length of the longest article. Row 0 is the header.
star = np.array([HEADER] + rows, dtype=object)
filename = 'vnex_vi.csv'

# %%
# The file is opened in append mode, so the header may only be written when the
# file is new or empty; otherwise every run would insert another header line
# in the middle of the data.
needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

# utf-8 so Vietnamese text is written identically on every platform;
# newline='' so the csv module controls line endings.
with open(filename, 'a', encoding='utf-8', newline='') as csvFile:
    csvWriter = csv.writer(csvFile, delimiter='|', quoting= csv.QUOTE_ALL)
    csvWriter.writerows(star if needs_header else star[1:])

# %%
df = pd.read_csv(filename,delimiter='|')
print(df)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"b'Date'"|"b'Content'"|"b'Source'" | ||
|
File renamed without changes.
Oops, something went wrong.