Skip to content

Commit

Permalink
Thầy cú lừa tui
Browse files Browse the repository at this point in the history
  • Loading branch information
ToAN7 committed Jul 19, 2024
1 parent 8b19635 commit 7794063
Show file tree
Hide file tree
Showing 4 changed files with 19,368 additions and 1 deletion.
2 changes: 1 addition & 1 deletion Project/Crawler_colab.ipynb
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"cells":[{"cell_type":"markdown","metadata":{"id":"sTj_1PDQTp6N"},"source":["# 1. Tải và cài đặt một số thư viện cho crawler"]},{"cell_type":"code","execution_count":1,"metadata":{"executionInfo":{"elapsed":589,"status":"ok","timestamp":1721071976795,"user":{"displayName":"27-Đức Toàn","userId":"05824633883755290128"},"user_tz":-420},"id":"FWq4ewjPThIE"},"outputs":[],"source":["import numpy as np\n","import pandas as pd\n","import csv\n","import newspaper\n","from newspaper import Article as Atc\n","from newspaper import Source as Scr"]},{"cell_type":"markdown","metadata":{"id":"2BHYvnzK28rJ"},"source":["# 2. Tải tất cả đường link và lọc ra những đường link chỉ chứa html:"]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":7442,"status":"ok","timestamp":1721071984236,"user":{"displayName":"27-Đức Toàn","userId":"05824633883755290128"},"user_tz":-420},"id":"j0pqu0CJTz1N","outputId":"0b3a9f81-a429-40aa-9e23-cd782b4a587f"},"outputs":[],"source":["vnex_paper = newspaper.build('https://vnexpress.net/', language = 'vi', memoize_articles = False, follow_meta_refresh = True, keep_article_html=True)"]},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":22,"status":"ok","timestamp":1721071984237,"user":{"displayName":"27-Đức Toàn","userId":"05824633883755290128"},"user_tz":-420},"id":"4mNGE-3Ztd1p","outputId":"0a778869-387d-4106-9781-95ecfcf18899"},"outputs":[{"name":"stdout","output_type":"stream","text":["Co tat ca 1705 den tu https://vnexpress.net/\n"]}],"source":["size = vnex_paper.size()\n","print(f\"Co tat ca {size} den tu https://vnexpress.net/\")"]},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1337,"status":"ok","timestamp":1721072037393,"user":{"displayName":"27-Đức Toàn","userId":"05824633883755290128"},"user_tz":-420},"id":"izDill-ko_OI","outputId":"f31ac0f5-6bcd-4740-d1b5-22acbf483d59"},"outputs":[{"name":"stdout","output_type":"stream","text":["Xuat hien 848 bai bao trong vnexpress\n"]}],"source":["valid_url = np.array([],np.string_)\n","for i in range(size):\n"," vnex_url = vnex_paper.article_urls()[i]\n"," # fillter out only-html link\n"," temp = ''\n"," for j in range(len(vnex_url)):\n"," temp = vnex_url[len(vnex_url) - j - 1] + temp\n"," if temp == 'htm' or temp == 'html':\n"," valid_url = np.append(valid_url,vnex_url)\n","print(f\"Xuat hien {np.shape(valid_url)[0]} bai bao trong vnexpress\")"]},{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cySL0POqqEM0","outputId":"58ab9a3a-4375-499d-da7b-7446104018a4"},"outputs":[],"source":["star = np.array(['Date', 'Content', 'Source'],np.string_)\n","for i in range(np.shape(valid_url)[0]):\n"," vnex_url = Atc(valid_url[i],language = 'vi')\n"," vnex_url.download()\n"," vnex_url.parse()\n"," vnex_url_cat = vnex_url.meta_data['tt_list_folder_name']\n"," try:\n"," vnex_url_cat = vnex_url_cat.split(',')\n"," except Exception:\n"," pass\n"," else:\n"," for j in vnex_url_cat:\n"," if j == 'Giới sao':\n"," row = [vnex_url.meta_data['lastmod'],vnex_url.text,vnex_url.url]\n"," star = np.append(star, row, axis = 0)\n"," break"]},{"cell_type":"markdown","metadata":{},"source":["# 3. Lưu lại dữ liệu với định dạng Date,Content,Source:"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[],"source":["star = np.reshape(star,(star.size//3,3))\n","filename = 'vnex_vi_celeb.csv'"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[],"source":["with open(filename, 'w') as csvFile:\n"," csvWriter = csv.writer(csvFile, delimiter='|', quoting= csv.QUOTE_ALL)\n"," csvWriter.writerows(star)"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" Date \\\n","0 2024-07-16T00:00:00+07:00 \n","1 2024-07-15T11:14:39+07:00 \n","2 2024-07-15T10:30:27+07:00 \n","3 2024-07-15T09:53:09+07:00 \n","4 2024-07-15T06:55:47+07:00 \n","5 2024-07-14T19:30:00+07:00 \n","6 2024-07-14T14:47:31+07:00 \n","7 2024-07-14T14:01:48+07:00 \n","8 2024-07-14T21:00:00+07:00 \n","9 2024-07-14T20:00:00+07:00 \n","10 2024-07-14T12:54:38+07:00 \n","11 2024-07-13T22:43:55+07:00 \n","12 2024-07-13T00:00:00+07:00 \n","13 2024-07-13T20:08:13+07:00 \n","14 2024-07-13T10:50:04+07:00 \n","15 2024-07-12T19:28:05+07:00 \n","16 2024-07-12T17:09:34+07:00 \n","17 2024-07-12T16:21:09+07:00 \n","18 2024-07-12T12:30:00+07:00 \n","19 2024-07-12T15:48:27+07:00 \n","\n"," Content \\\n","0 Tăng Phúc - giọng ca \"Chỉ là không cùng nhau\" ... \n","1 Theo Daily Mail ngày 14/7, nhiều nghệ sĩ có mặ... \n","2 Cô từng dự thi Miss Universe Thái Lan 2022, và... \n","3 RomaniaWiz Khalifa - rapper nổi tiếng với hit ... \n","4 Diễn viên Shannen Doherty - đóng phim \"Phép th... \n","5 Hình ảnh cô catwalk trong cuộc thi. Kenza Layl... \n","6 Pax Thiên - con trai gốc Việt của minh tinh An... \n","7 Hoàng Tử Thao - ca sĩ, diễn viên được cho là... \n","8 Ca sĩ Mỹ Selena Gomez hẹn hò Benny Blanco - bạ... \n","9 Anh 30 tuổi, tốt nghiệp ngành thanh nhạc nhưng... \n","10 Tuấn Ngọc - người mẫu quen mặt ở làng thời tra... \n","11 Người mẫu Phạm Tuấn Ngọc, 25 tuổi, thắng danh ... \n","12 Gia đình bốn người của diễn viên Bùi Bài Bình ... \n","13 Á hậu Ngọc Thảo tạo dáng với đầm cắt xẻ.\\r\\n\\r... \n","14 MỹDiễn viên Alec Baldwin, 66 tuổi, khóc khi th... \n","15 Nhà sản xuất Miss Universe Vietnam cho rằng Ho... \n","16 Taylor Swift - Travis Kelce được cho chi nhiều... \n","17 Bà nội Jane Pitt bế bé Knox hai tuổi trên ban ... \n","18 Quyền Linh bên hai con gái. Nghệ sĩ tên thật l... \n","19 Quán quân The New Mentor 2023 Lê Thu Trang mặc... \n","\n"," Source \n","0 https://vnexpress.net/tang-phuc-toi-tung-stres... \n","1 https://vnexpress.net/dan-sao-xem-chung-ket-eu... \n","2 https://vnexpress.net/nhan-sac-hoa-hau-hoan-vu... \n","3 https://vnexpress.net/rapper-wiz-khalifa-bi-ba... \n","4 https://vnexpress.net/sao-phep-thuat-qua-doi-4... \n","5 https://vnexpress.net/nhan-sac-hoa-hau-ai-dau-... \n","6 https://vnexpress.net/pax-thien-moi-me-an-toi-... \n","7 https://vnexpress.net/than-tuong-giau-nhat-tru... \n","8 https://vnexpress.net/chuyen-tinh-song-gio-cua... \n","9 https://vnexpress.net/ngoai-hinh-hut-fan-cua-d... \n","10 https://vnexpress.net/sac-voc-tan-mr-world-vie... \n","11 https://vnexpress.net/pham-tuan-ngoc-dang-quan... \n","12 https://vnexpress.net/nghe-si-bui-bai-binh-son... \n","13 https://vnexpress.net/hoa-hau-a-hau-du-chung-k... \n","14 https://vnexpress.net/tai-tu-alec-baldwin-bat-... \n","15 https://vnexpress.net/nsx-miss-universe-vietna... \n","16 https://vnexpress.net/taylor-swift-va-ban-trai... \n","17 https://vnexpress.net/chan-dung-con-trai-ut-cu... \n","18 https://vnexpress.net/net-dep-tuoi-16-cua-con-... \n","19 https://vnexpress.net/thanh-hang-dien-goi-cam-... \n"]}],"source":["df = pd.read_csv(filename,delimiter='|')\n","print(df)"]}],"metadata":{"colab":{"authorship_tag":"ABX9TyOodf1jXMKyWvsOXsFDwups","collapsed_sections":["sTj_1PDQTp6N"],"provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.18"}},"nbformat":4,"nbformat_minor":0}
{"cells":[{"cell_type":"markdown","metadata":{"id":"sTj_1PDQTp6N"},"source":["# 1. Tải và cài đặt một số thư viện cho crawler"]},{"cell_type":"code","execution_count":15,"metadata":{"executionInfo":{"elapsed":589,"status":"ok","timestamp":1721071976795,"user":{"displayName":"27-Đức Toàn","userId":"05824633883755290128"},"user_tz":-420},"id":"FWq4ewjPThIE"},"outputs":[],"source":["import numpy as np\n","import pandas as pd\n","import csv\n","import newspaper\n","from newspaper import Article as Atc\n","from newspaper import Source as Scr"]},{"cell_type":"markdown","metadata":{"id":"2BHYvnzK28rJ"},"source":["# 2. Tải tất cả đường link và lọc ra những đường link chỉ chứa html:"]},{"cell_type":"code","execution_count":16,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":7442,"status":"ok","timestamp":1721071984236,"user":{"displayName":"27-Đức Toàn","userId":"05824633883755290128"},"user_tz":-420},"id":"j0pqu0CJTz1N","outputId":"0b3a9f81-a429-40aa-9e23-cd782b4a587f"},"outputs":[],"source":["vnex_paper = newspaper.build('https://vnexpress.net/', language = 'vi', follow_meta_refresh = True, keep_article_html=True)"]},{"cell_type":"code","execution_count":17,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":22,"status":"ok","timestamp":1721071984237,"user":{"displayName":"27-Đức Toàn","userId":"05824633883755290128"},"user_tz":-420},"id":"4mNGE-3Ztd1p","outputId":"0a778869-387d-4106-9781-95ecfcf18899"},"outputs":[{"name":"stdout","output_type":"stream","text":["Co tat ca 0 den tu https://vnexpress.net/\n"]}],"source":["size = vnex_paper.size()\n","print(f\"Co tat ca {size} den tu https://vnexpress.net/\")"]},{"cell_type":"code","execution_count":18,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1337,"status":"ok","timestamp":1721072037393,"user":{"displayName":"27-Đức Toàn","userId":"05824633883755290128"},"user_tz":-420},"id":"izDill-ko_OI","outputId":"f31ac0f5-6bcd-4740-d1b5-22acbf483d59"},"outputs":[{"name":"stdout","output_type":"stream","text":["Xuat hien 0 bai bao trong vnexpress\n"]}],"source":["valid_url = np.array([],np.string_)\n","for i in range(size):\n"," vnex_url = vnex_paper.article_urls()[i]\n"," # fillter out only-html link\n"," temp = ''\n"," for j in range(len(vnex_url)):\n"," temp = vnex_url[len(vnex_url) - j - 1] + temp\n"," if temp == 'htm' or temp == 'html':\n"," valid_url = np.append(valid_url,vnex_url)\n","print(f\"Xuat hien {np.shape(valid_url)[0]} bai bao trong vnexpress\")"]},{"cell_type":"code","execution_count":19,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cySL0POqqEM0","outputId":"58ab9a3a-4375-499d-da7b-7446104018a4"},"outputs":[],"source":["star = np.array(['Date', 'Content', 'Source'],np.string_)\n","for i in range(np.shape(valid_url)[0]):\n"," vnex_url = Atc(valid_url[i],language = 'vi')\n"," vnex_url.download()\n"," vnex_url.parse()\n"," row = [vnex_url.meta_data['lastmod'],vnex_url.text,vnex_url.url]\n"," star = np.append(star, row, axis = 0)"]},{"cell_type":"markdown","metadata":{},"source":["# 3. Lưu lại dữ liệu với định dạng Date,Content,Source:"]},{"cell_type":"code","execution_count":20,"metadata":{},"outputs":[],"source":["star = np.reshape(star,(star.size//3,3))\n","filename = 'vnex_vi.csv'"]},{"cell_type":"code","execution_count":21,"metadata":{},"outputs":[],"source":["with open(filename, 'a') as csvFile:\n"," csvWriter = csv.writer(csvFile, delimiter='|', quoting= csv.QUOTE_ALL)\n"," csvWriter.writerows(star)"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Empty DataFrame\n","Columns: [b'Date', b'Content', b'Source']\n","Index: []\n"]}],"source":["df = pd.read_csv(filename,delimiter='|')\n","print(df)"]}],"metadata":{"colab":{"authorship_tag":"ABX9TyOodf1jXMKyWvsOXsFDwups","collapsed_sections":["sTj_1PDQTp6N"],"provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.18"}},"nbformat":4,"nbformat_minor":0}
1 change: 1 addition & 0 deletions Project/vnex_vi.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"b'Date'"|"b'Content'"|"b'Source'"
Expand Down
File renamed without changes.
Loading

0 comments on commit 7794063

Please sign in to comment.