diff --git a/README.md b/README.md index 7384fbfa..134d223c 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ This is not an official API support and etc. This is just a scraper that is usin - Sign URL to make custom request to the TIkTok API - Extract metadata from the User, Hashtag and Sginel Video pages - **Save previous progress and download only new videos that weren't downloaded before**. This feature only works from the CLI and only if **download** flag is on. +- **View and manage previously downloaded posts history in the CLI** ## To Do @@ -31,8 +32,8 @@ This is not an official API support and etc. This is just a scraper that is usin - [x] Add tests - [x] Download video without the watermark - [x] Indicate in the output file(csv/json) if the video was downloaded or not -- [ ] Scrape metadata and download video from the multiple users/hashtags specified in a source(file or etc) -- [ ] Scrape users/hashtag +- [ ] Scrape metadata and download posts from different users/hashtags in batch +- [ ] Scrape users/hashtags - [ ] Web interface ## Contribution @@ -48,7 +49,7 @@ yarn test yarn build ``` -## JSON/CSV output: +## Post metadata example: ```javascript { @@ -86,8 +87,37 @@ yarn build }[] ``` +## CSV file example + ![Demo](https://i.imgur.com/6gIbBzo.png) +## View and manage previously downloaded posts history in the CLI + +You can only view this history from the CLI and only if you have used -s flag in your previous scraper executions +**-s** save download history to avoid downloading duplicate posts in the future + +To view history record: + +```sh +tiktok-scraper history +``` + +To delete single history record: + +```sh +tiktok-scraper history -r TYPE:INPUT +tiktok-scraper history -r user:tiktok +tiktok-scraper history -r hashtag:summer +tiktok-scraper history -r trend +``` + +To delete all records: + +```sh +tiktok-scraper history -r all +``` + +![History](https://i.imgur.com/VnDKh72.png) **Possible errors** - Unknown. Report them if you will receive any @@ -122,6 +152,7 @@ Commands: tiktok-scraper hashtag [id] Scrape videos from hashtag. Enter hashtag without # tiktok-scraper trend Scrape posts from current trends tiktok-scraper music [id] Scrape posts from a music id number + tiktok-scraper history View previous download history Options: --help, -h help [boolean] @@ -142,12 +173,19 @@ Options: avoiding duplicates [boolean] [default: false] --noWaterMark, -w Download video without the watermark. This option will affect the execution speed [boolean] [default: false] + --remove, -r Delete the history record by entering "TYPE:INPUT" or + "all" to clean all the history. For example: user:bob + [default: ""] Examples: tiktok-scraper user USERNAME -d -n 100 tiktok-scraper hashtag HASHTAG_NAME -d -n 100 tiktok-scraper trend -d -n 100 tiktok-scraper music MUSICID -n 100 + tiktok-scraper music MUSIC_ID -d -n 50 + tiktok-scraper history + tiktok-scraper history -r user:bob + tiktok-scraper history -r all ``` **Example 1:** @@ -222,6 +260,13 @@ ZIP path: /{CURRENT_PATH}/trend_1552945659138.zip CSV path: /{CURRENT_PATH}/tend_1552945659138.csv ``` +**Example 7:** +View previous download history + +```sh +tiktok-scraper history +``` + **To make it look better, when downloading posts the progress will be shown in terminal** ```sh diff --git a/bin/cli.js b/bin/cli.js index ce7c05c6..e2ee269a 100644 --- a/bin/cli.js +++ b/bin/cli.js @@ -4,6 +4,7 @@ /* eslint-disable prefer-destructuring */ /* eslint-disable no-param-reassign */ +const yargs = require('yargs'); const TikTokScraper = require('../build'); const CONST = require('../build/constant'); @@ -25,16 +26,26 @@ const startScraper = async argv => { if (scraper.csv) { console.log(`CSV path: ${scraper.csv}`); } + if (scraper.message) { + console.log(scraper.message); + } + if (scraper.table) { + console.table(scraper.table); + } } catch (error) { console.log(error); } }; -require('yargs') +yargs .usage('Usage: $0 [options]') .example(`$0 user USERNAME -d -n 100`) .example(`$0 trend -d -n 100`) .example(`$0 hashtag HASHTAG_NAME -d -n 100`) + .example(`$0 music MUSIC_ID -d -n 50`) + .example(`$0 history`) + .example(`$0 history -r user:bob`) + .example(`$0 history -r all`) .command('user [id]', 'Scrape videos from username. Enter only username', {}, argv => { startScraper(argv); }) @@ -47,6 +58,9 @@ require('yargs') .command('music [id]', 'Scrape videos from music id. Enter only music id', {}, argv => { startScraper(argv); }) + .command('history', 'View previous post download history', {}, argv => { + startScraper(argv); + }) .options({ help: { alias: 'h', @@ -90,6 +104,11 @@ require('yargs') default: false, describe: 'Download video without the watermark. This option will affect the execution speed', }, + remove: { + alias: ['r'], + default: '', + describe: 'Delete the history record by entering "TYPE:INPUT" or "all" to clean all the history. For example: user:bob', + }, }) .check(argv => { if (CONST.scrape.indexOf(argv._[0]) === -1) { @@ -98,9 +117,27 @@ require('yargs') if (argv.store) { if (!argv.download) { - throw new Error('--store, -s flag only works in combination with the download flag. Add -d to your command'); + throw new Error('--store, -s flag is only working in combination with the download flag. Add -d to your command'); } } + + if (argv.remove) { + if (argv.remove.indexOf(':') === -1) { + argv.remove = `${argv.remove}:`; + } + const split = argv.remove.split(':'); + const type = split[0]; + const input = split[1]; + + if (type !== 'all' && CONST.history.indexOf(type) === -1) { + throw new Error(`--remove, -r list of allowed types: ${CONST.history}`); + } + if (!input && type !== 'trend' && type !== 'all') { + throw new Error('--remove, -r to remove the specific history record you need to enter "TYPE:INPUT". For example: user:bob'); + } + } + return true; }) - .demandCommand().argv; + .demandCommand() + .help().argv; diff --git a/src/constant/index.ts b/src/constant/index.ts index cc63d540..4ae4f14c 100644 --- a/src/constant/index.ts +++ b/src/constant/index.ts @@ -1,4 +1,5 @@ export = { - scrape: ['user', 'hashtag', 'trend', 'music', 'discover_user', 'discover_hashtag', 'discover_music'], + scrape: ['user', 'hashtag', 'trend', 'music', 'discover_user', 'discover_hashtag', 'discover_music', 'history'], userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:74.0) Gecko/20100101 Firefox/74.0', + history: ['user', 'hashtag', 'trend', 'music'], }; diff --git a/src/core/TikTok.test.ts b/src/core/TikTok.test.ts index 7861d63d..298f19a4 100644 --- a/src/core/TikTok.test.ts +++ b/src/core/TikTok.test.ts @@ -13,9 +13,11 @@ describe('TikTok Scraper MODULE(promise): user(valid input data)', () => { instance = new TikTokScraper({ download: false, asyncDownload: 5, + asyncScraping: 3, filetype: '', filepath: '', input: 'tiktok', + noWaterMark: false, type: 'user', userAgent: 'Custom User-Agent', proxy: '', @@ -54,6 +56,7 @@ describe('TikTok Scraper MODULE(event): user(valid input data)', () => { instance = new TikTokScraper({ download: false, asyncDownload: 5, + asyncScraping: 5, filetype: '', filepath: '', input: 'tiktok', @@ -115,6 +118,7 @@ describe('TikTok Scraper MODULE(promise): user(invalid input data)', () => { const instance = new TikTokScraper({ download: false, asyncDownload: 5, + asyncScraping: 5, filetype: '', filepath: '', input: '', @@ -130,6 +134,7 @@ describe('TikTok Scraper MODULE(promise): user(invalid input data)', () => { const instance = new TikTokScraper({ download: false, asyncDownload: 5, + asyncScraping: 5, filetype: '', filepath: '', input: '', @@ -147,6 +152,7 @@ describe('TikTok Scraper MODULE(event): user(invalid input data)', () => { const instance = new TikTokScraper({ download: false, asyncDownload: 5, + asyncScraping: 5, filetype: '', filepath: '', input: '', @@ -167,6 +173,7 @@ describe('TikTok Scraper MODULE(event): user(invalid input data)', () => { const instance = new TikTokScraper({ download: false, asyncDownload: 5, + asyncScraping: 5, filetype: '', filepath: '', input: '', @@ -193,6 +200,7 @@ describe('TikTok Scraper MODULE(promise): user(save to a file)', () => { instance = new TikTokScraper({ download: false, asyncDownload: 5, + asyncScraping: 5, filetype: 'all', filepath: '', input: 'tiktok', @@ -225,6 +233,7 @@ describe('TikTok Scraper MODULE(promise): hashtag(valid input data)', () => { instance = new TikTokScraper({ download: false, asyncDownload: 5, + asyncScraping: 5, filetype: '', filepath: '', input: 'summer', @@ -257,6 +266,7 @@ describe('TikTok Scraper MODULE(promise): signUrl', () => { instance = new TikTokScraper({ download: false, asyncDownload: 5, + asyncScraping: 5, filetype: '', filepath: '', input: 'https://m.tiktok.com/share/item/list?secUid=&id=355503&type=3&count=30&minCursor=0&maxCursor=0&shareUid=&lang=', @@ -288,6 +298,7 @@ describe('TikTok Scraper MODULE(promise): getHashtagInfo', () => { instance = new TikTokScraper({ download: false, asyncDownload: 5, + asyncScraping: 5, filetype: '', filepath: '', input: hasthagName, @@ -338,6 +349,7 @@ describe('TikTok Scraper MODULE(promise): getUserProfileInfo', () => { instance = new TikTokScraper({ download: false, asyncDownload: 5, + asyncScraping: 5, filetype: '', filepath: '', input: userName, @@ -399,6 +411,7 @@ describe('TikTok Scraper CLI: user(save progress)', () => { store_history: true, test: true, asyncDownload: 5, + asyncScraping: 5, filetype: '', filepath: '', input: 'tiktok', @@ -414,12 +427,12 @@ describe('TikTok Scraper CLI: user(save progress)', () => { jest.restoreAllMocks(); }); - it('fs.readFile should be called once', async () => { - expect(fs.readFile).toHaveBeenCalledTimes(1); + it('fs.readFile should be called 2 times', async () => { + expect(fs.readFile).toHaveBeenCalledTimes(2); }); - it('fs.writeFile should be called once', async () => { - expect(fs.writeFile).toHaveBeenCalledTimes(1); + it('fs.writeFile should be called 2 times', async () => { + expect(fs.writeFile).toHaveBeenCalledTimes(2); }); it('result should contain a valid file name for the Zip file', async () => { @@ -433,6 +446,7 @@ describe('TikTok Scraper MODULE(promise): getVideoMeta', () => { instance = new TikTokScraper({ download: false, asyncDownload: 5, + asyncScraping: 5, filetype: '', filepath: '', input: 'https://www.tiktok.com/@tiktok/video/6807491984882765062', diff --git a/src/core/TikTok.ts b/src/core/TikTok.ts index d0008097..73cca27e 100644 --- a/src/core/TikTok.ts +++ b/src/core/TikTok.ts @@ -14,7 +14,7 @@ import CONST from '../constant'; import { generateSignature } from '../helpers'; -import { PostCollector, ScrapeType, TikTokConstructor, Result, ItemListData, ApiResponse, Challenge, UserData, RequestQuery, Item } from '../types'; +import { PostCollector, ScrapeType, TikTokConstructor, Result, ItemListData, ApiResponse, Challenge, UserData, RequestQuery, Item, History } from '../types'; import { Downloader } from '../core'; @@ -71,6 +71,8 @@ export class TikTokScraper extends EventEmitter { private storeValue: string = ''; + private maxCursor: number; + private test: boolean = false; private noWaterMark: boolean; @@ -128,6 +130,7 @@ export class TikTokScraper extends EventEmitter { this.idStore = ''; this.test = test; this.noWaterMark = noWaterMark; + this.maxCursor = 0; this.Downloader = new Downloader({ progress, proxy, @@ -311,7 +314,6 @@ export class TikTokScraper extends EventEmitter { */ private mainLoop(): Promise { return new Promise(resolve => { - let maxCursor = 0; const arrayLength = this.number % 30 ? Math.ceil(this.number / 30) : Math.ceil(this.number / 30) + 1; const taskArray = Array.from({ length: arrayLength }, (v, k) => k + 1); forEachLimit( @@ -322,11 +324,8 @@ export class TikTokScraper extends EventEmitter { case 'user': this.fileName = `${this.input}_${Date.now()}`; this.getUserId() - .then(query => this.submitScrapingRequest(query, maxCursor)) - .then(cursor => { - maxCursor = cursor; - cb(null); - }) + .then(query => this.submitScrapingRequest(query, this.maxCursor)) + .then(() => cb(null)) .catch(() => cb(null)); break; case 'hashtag': @@ -338,7 +337,7 @@ export class TikTokScraper extends EventEmitter { break; case 'trend': this.getTrendingFeedQuery() - .then(query => this.submitScrapingRequest(query, item === 1 ? 0 : 1)) + .then(query => this.submitScrapingRequest(query, this.maxCursor)) .then(() => cb(null)) .catch(() => cb(null)); break; @@ -363,7 +362,7 @@ export class TikTokScraper extends EventEmitter { * Submit request to the TikTok web API * Collect received metadata */ - private async submitScrapingRequest(query, item): Promise { + private async submitScrapingRequest(query, item): Promise { try { const result = await this.scrapeData(query, item); @@ -376,7 +375,7 @@ export class TikTokScraper extends EventEmitter { if (!result.body.hasMore) { throw new Error('No more posts'); } - return result.body.maxCursor; + this.maxCursor = result.body.maxCursor; } catch (error) { throw error.message; } @@ -438,7 +437,32 @@ export class TikTokScraper extends EventEmitter { * Only available from the CLI */ private async storeDownlodProgress() { + const historyType = this.scrapeType === 'trend' ? 'trend' : `${this.scrapeType}_${this.input}`; if (this.storeValue) { + let history = {} as History; + + try { + const readFromStore = (await fromCallback(cb => readFile(`${this.tmpFolder}/tiktok_history.json`, { encoding: 'utf-8' }, cb))) as string; + history = JSON.parse(readFromStore); + } catch (error) { + history[historyType] = { + type: this.scrapeType, + input: this.input, + downloaded_posts: 0, + last_change: new Date(), + file_location: `${this.tmpFolder}/${this.storeValue}.json`, + }; + } + + if (!history[historyType]) { + history[historyType] = { + type: this.scrapeType, + input: this.input, + downloaded_posts: 0, + last_change: new Date(), + file_location: `${this.tmpFolder}/${this.storeValue}.json`, + }; + } let store: string[]; try { const readFromStore = (await fromCallback(cb => readFile(`${this.tmpFolder}/${this.storeValue}.json`, { encoding: 'utf-8' }, cb))) as string; @@ -458,11 +482,25 @@ export class TikTokScraper extends EventEmitter { }); this.collector = this.collector.filter(item => !item.repeated); + history[historyType] = { + type: this.scrapeType, + input: this.input, + downloaded_posts: history[historyType].downloaded_posts + this.collector.length, + last_change: new Date(), + file_location: `${this.tmpFolder}/${this.storeValue}.json`, + }; + try { await fromCallback(cb => writeFile(`${this.tmpFolder}/${this.storeValue}.json`, JSON.stringify(store), cb)); } catch (error) { // continue regardless of error } + + try { + await fromCallback(cb => writeFile(`${this.tmpFolder}/tiktok_history.json`, JSON.stringify(history), cb)); + } catch (error) { + // continue regardless of error + } } } diff --git a/src/entry.ts b/src/entry.ts index e9118679..d4c20fb8 100644 --- a/src/entry.ts +++ b/src/entry.ts @@ -1,5 +1,10 @@ +/* eslint-disable no-throw-literal */ +/* eslint-disable no-restricted-syntax */ +import { tmpdir } from 'os'; +import { readFile, writeFile, unlink } from 'fs'; +import { fromCallback } from 'bluebird'; import { TikTokScraper } from './core'; -import { TikTokConstructor, Options, ScrapeType, Result, UserData, Challenge, PostCollector } from './types'; +import { TikTokConstructor, Options, ScrapeType, Result, UserData, Challenge, PostCollector, History, HistoryItem } from './types'; import CONST from './constant'; const INIT_OPTIONS = { @@ -90,3 +95,46 @@ export const getVideoMeta = async (input: string, options?: Options): Promise { + const store = (await fromCallback(cb => readFile(`${tmpdir()}/tiktok_history.json`, { encoding: 'utf-8' }, cb))) as string; + const historyStore: History = JSON.parse(store); + + if (options?.remove) { + const split = options.remove.split(':'); + const type = split[0]; + + if (type === 'all') { + const remove: any = []; + for (const key of Object.keys(historyStore)) { + remove.push(fromCallback(cb => unlink(historyStore[key].file_location, cb))); + } + remove.push(fromCallback(cb => unlink(`${tmpdir()}/tiktok_history.json`, cb))); + + await Promise.all(remove); + + return { message: `History was completely removed` }; + } + + const key = type !== 'trend' ? options.remove.replace(':', '_') : 'trend'; + + if (historyStore[key]) { + const historyFile = historyStore[key].file_location; + + await fromCallback(cb => unlink(historyFile, cb)); + + delete historyStore[key]; + + await fromCallback(cb => writeFile(`${tmpdir()}/tiktok_history.json`, JSON.stringify(historyStore), cb)); + + return { message: `Record ${key} was removed` }; + } + throw `Can't find record: ${key.split('_').join(' ')}`; + } + const table: HistoryItem[] = []; + for (const key of Object.keys(historyStore)) { + table.push(historyStore[key]); + } + return { table }; +}; diff --git a/src/types/Cli.ts b/src/types/Cli.ts new file mode 100644 index 00000000..f5db3bcb --- /dev/null +++ b/src/types/Cli.ts @@ -0,0 +1,13 @@ +import { ScrapeType } from '.'; + +export interface HistoryItem { + type: ScrapeType; + input: string; + downloaded_posts: number; + last_change: Date; + file_location: string; +} + +export interface History { + [key: string]: HistoryItem; +} diff --git a/src/types/TikTok.ts b/src/types/TikTok.ts index 7dbd10fa..7c6a4024 100644 --- a/src/types/TikTok.ts +++ b/src/types/TikTok.ts @@ -24,6 +24,7 @@ export interface Options { number?: number; userAgent?: string; noWaterMark?: boolean; + remove?: string; } export interface TikTokConstructor { download: boolean; diff --git a/src/types/index.ts b/src/types/index.ts index f805607a..b3213bef 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -1,3 +1,4 @@ export * from './TikTok'; export * from './Downloader'; export * from './TikTokApi'; +export * from './Cli';