Skip to content

Commit

Permalink
Total refactoring
Browse files Browse the repository at this point in the history
Used guest frontend API
BREAKING CHANGE: remove tweet.HTML property
Loading more information
Minor fixes and changes
  • Loading branch information
Alexander Sheiko committed Dec 11, 2020
1 parent 1c582e1 commit edad8f6
Show file tree
Hide file tree
Showing 15 changed files with 618 additions and 487 deletions.
41 changes: 15 additions & 26 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# Twitter Scraper

Golang implementation of python library <https://github.com/bisguzar/twitter-scraper>

Twitter's API is annoying to work with, and has lots of limitations —
luckily their frontend (JavaScript) has its own API, which I reverse-engineered.
No API rate limits. No tokens needed. No restrictions. Extremely fast.
Expand Down Expand Up @@ -32,12 +30,12 @@ func main() {
if tweet.Error != nil {
panic(tweet.Error)
}
fmt.Println(tweet.HTML)
fmt.Println(tweet.Text)
}
}
```

It appears you can ask for up to 50 tweets.
It appears you can ask for up to 50 tweets (limit ~3200 tweets).

### Search tweets by query standard operators

Expand All @@ -58,32 +56,11 @@ func main() {
if tweet.Error != nil {
panic(tweet.Error)
}
fmt.Println(tweet.HTML)
fmt.Println(tweet.Text)
}
}
```
#### With http proxy

```golang
package main

import (
"context"
"fmt"
twitterscraper "github.com/n0madic/twitter-scraper"
)

func main() {
twitterscraper.SetProxy("http://localhost:16379")
for tweet := range twitterscraper.SearchTweets(context.Background(),
"twitter scraper data -filter:retweets", 50) {
if tweet.Error != nil {
panic(tweet.Error)
}
fmt.Println(tweet.HTML)
}
}
```
The search ends if we have 50 tweets.

See [Rules and filtering](https://developer.twitter.com/en/docs/tweets/rules-and-filtering/overview/standard-operators) for building standard queries.
Expand Down Expand Up @@ -125,3 +102,15 @@ func main() {
fmt.Println(trends)
}
```

### Use http proxy

```golang
twitterscraper.SetProxy("http://localhost:3128")
```

### Load timeline with tweet replies

```golang
twitterscraper.IncludeReplies = true
```
105 changes: 105 additions & 0 deletions api.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package twitterscraper

import (
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"sync"
)

// bearerToken is the value sent as "Authorization: Bearer <token>" on the
// requests issued by requestAPI and GetGuestToken.
const bearerToken string = "AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"

// user mirrors the part of the UserByScreenName JSON response that
// GetUserIDByScreenName reads: the value at data.user.rest_id.
type user struct {
Data struct {
User struct {
// RestID is returned to callers as the user ID.
RestID string `json:"rest_id"`
} `json:"user"`
} `json:"data"`
}

var (
// guestToken is set by GetGuestToken and sent as the X-Guest-Token
// header by requestAPI; the empty string means no token has been stored.
guestToken string
// cacheIDs maps a screen name to the user ID previously resolved for it
// by GetUserIDByScreenName.
cacheIDs sync.Map
)

// requestAPI sends req with the bearer and guest-token headers attached and
// decodes the JSON response body into target.
//
// If no guest token has been stored yet, one is fetched with GetGuestToken
// first. target must be a pointer accepted by json.Decoder.Decode.
//
// An error is returned when the guest token cannot be obtained, the request
// fails, the server answers with a status other than 200 OK, or the body
// cannot be decoded into target.
func requestAPI(req *http.Request, target interface{}) error {
	if guestToken == "" {
		if err := GetGuestToken(); err != nil {
			return err
		}
	}

	req.Header.Set("Authorization", "Bearer "+bearerToken)
	req.Header.Set("X-Guest-Token", guestToken)

	resp, err := newHTTPClient().Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// Report HTTP failures explicitly instead of decoding an error payload
	// into target, which would hide the real cause from the caller.
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("response status %s", resp.Status)
	}

	return json.NewDecoder(resp.Body).Decode(target)
}

// GetGuestToken posts to the guest activation endpoint and stores the
// "guest_token" string from the JSON reply in the package-level guestToken.
//
// It returns an error if the request cannot be built or sent, the response
// status is not 200 OK, the body cannot be read or parsed as JSON, or the
// reply has no string "guest_token" field.
func GetGuestToken() error {
	request, err := http.NewRequest("POST", "https://api.twitter.com/1.1/guest/activate.json", nil)
	if err != nil {
		return err
	}
	request.Header.Set("Authorization", "Bearer "+bearerToken)

	response, err := newHTTPClient().Do(request)
	if err != nil {
		return err
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return fmt.Errorf("response status %s", response.Status)
	}

	payload, err := ioutil.ReadAll(response.Body)
	if err != nil {
		return err
	}

	var fields map[string]interface{}
	if err = json.Unmarshal(payload, &fields); err != nil {
		return err
	}

	// The stored token is overwritten even when the field is missing or not a
	// string; in that case it becomes the empty string.
	token, isString := fields["guest_token"].(string)
	guestToken = token
	if !isString {
		return fmt.Errorf("guest_token not found")
	}

	return nil
}

// GetUserIDByScreenName resolves screenName to the user's rest_id through the
// UserByScreenName GraphQL endpoint.
//
// Successful lookups are stored in cacheIDs, so repeated calls with the same
// screenName return the cached ID without another request. An error is
// returned when the request cannot be built or fails, or when the response
// carries no rest_id.
func GetUserIDByScreenName(screenName string) (string, error) {
	if id, ok := cacheIDs.Load(screenName); ok {
		return id.(string), nil
	}

	// Build the "variables" JSON with the encoder rather than by string
	// concatenation so that quotes, ampersands, percent signs or spaces in
	// screenName cannot break or inject into the query.
	variables, err := json.Marshal(struct {
		ScreenName           string `json:"screen_name"`
		WithHighlightedLabel bool   `json:"withHighlightedLabel"`
	}{
		ScreenName:           screenName,
		WithHighlightedLabel: true,
	})
	if err != nil {
		return "", err
	}

	req, err := http.NewRequest("GET", "https://api.twitter.com/graphql/4S2ihIKfF3xhp-ENxvUAfQ/UserByScreenName", nil)
	if err != nil {
		return "", err
	}

	query := req.URL.Query()
	query.Set("variables", string(variables))
	req.URL.RawQuery = query.Encode()

	var jsn user
	if err := requestAPI(req, &jsn); err != nil {
		return "", err
	}

	if jsn.Data.User.RestID == "" {
		return "", fmt.Errorf("rest_id not found")
	}

	cacheIDs.Store(screenName, jsn.Data.User.RestID)

	return jsn.Data.User.RestID, nil
}
24 changes: 24 additions & 0 deletions api_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package twitterscraper

import (
"testing"
)

// TestGetGuestToken calls GetGuestToken and checks that it succeeds and
// leaves a non-empty value in guestToken.
func TestGetGuestToken(t *testing.T) {
	err := GetGuestToken()
	if err != nil {
		t.Errorf("getGuestToken() error = %v", err)
	}

	if len(guestToken) == 0 {
		t.Error("Expected non-empty guestToken")
	}
}

// TestGetUserIDByScreenName resolves the "Twitter" screen name and checks
// that a non-empty user ID comes back.
func TestGetUserIDByScreenName(t *testing.T) {
	userID, err := GetUserIDByScreenName("Twitter")
	if err != nil {
		// Stop here: userID is empty on error, so the check below would only
		// repeat the same failure.
		t.Fatalf("GetUserIDByScreenName() error = %v", err)
	}
	if userID == "" {
		t.Error("Expected non-empty user ID")
	}
}
5 changes: 1 addition & 4 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,4 @@ module github.com/n0madic/twitter-scraper

go 1.13

require (
github.com/PuerkitoBio/goquery v1.5.1
github.com/google/go-cmp v0.4.0
)
require github.com/google/go-cmp v0.5.4
14 changes: 2 additions & 12 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,14 +1,4 @@
github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE=
github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4=
github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M=
github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
96 changes: 44 additions & 52 deletions profile.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,7 @@ package twitterscraper

import (
"fmt"
"net"
"net/http"
"strconv"
"strings"
"time"

"github.com/PuerkitoBio/goquery"
)

// Profile of twitter user.
Expand All @@ -19,12 +13,15 @@ type Profile struct {
Birthday string
FollowersCount int
FollowingCount int
FriendsCount int
IsPrivate bool
IsVerified bool
Joined *time.Time
LikesCount int
ListedCount int
Location string
Name string
PinnedTweetIDs []string
TweetsCount int
URL string
UserID string
Expand All @@ -34,66 +31,61 @@ type Profile struct {

// GetProfile return parsed user profile.
func GetProfile(username string) (Profile, error) {
url := "https://mobile.twitter.com/" + username

client := http.DefaultClient
if HTTPProxy != nil {
client = &http.Client{
Transport: &http.Transport{
Proxy: http.ProxyURL(HTTPProxy),
DialContext: (&net.Dialer{
Timeout: 10 * time.Second,
}).DialContext,
},
}
}

req, err := http.NewRequest("GET", url, nil)
userID, err := GetUserIDByScreenName(username)
if err != nil {
return Profile{}, err
}
req.Header.Set("Accept-Language", "en-US")

resp, err := client.Do(req)
if resp == nil {
req, err := newRequest("GET", "https://twitter.com/i/api/2/timeline/profile/"+userID+".json")
if err != nil {
return Profile{}, err
}
defer resp.Body.Close()

if resp.StatusCode != http.StatusOK {
return Profile{}, fmt.Errorf("response status: %s", resp.Status)
}
q := req.URL.Query()
q.Add("count", "20")
q.Add("userId", userID)
req.URL.RawQuery = q.Encode()

doc, err := goquery.NewDocumentFromReader(resp.Body)
var timeline timeline
err = requestAPI(req, &timeline)
if err != nil {
return Profile{}, err
}

// parse join date text
screenName := doc.Find(".screen-name").First().Text()

// check is username valid
if screenName == "" {
user, found := timeline.GlobalObjects.Users[userID]
if !found {
return Profile{}, fmt.Errorf("either @%s does not exist or is private", username)
}

return Profile{
Avatar: doc.Find("td.avatar > img").First().AttrOr("src", ""),
Biography: strings.TrimSpace(doc.Find(".bio").First().Text()),
FollowersCount: parseCount(doc.Find("table.profile-stats > tbody > tr > td:nth-child(3) > a > div.statnum").First().Text()),
FollowingCount: parseCount(doc.Find("table.profile-stats > tbody > tr > td:nth-child(2) > a > div.statnum").First().Text()),
IsPrivate: strings.Contains(doc.Find("div.fullname > a.badge > img").First().AttrOr("src", ""), "protected"),
IsVerified: strings.Contains(doc.Find("div.fullname > a.badge > img").First().AttrOr("src", ""), "verified"),
Location: strings.TrimSpace(doc.Find(".location").First().Text()),
Name: strings.TrimSpace(doc.Find(".fullname").First().Text()),
TweetsCount: parseCount(doc.Find("table.profile-stats > tbody > tr > td:nth-child(1) > div.statnum").First().Text()),
URL: "https://twitter.com/" + screenName,
Username: screenName,
Website: strings.TrimSpace(doc.Find("div.url > div > a").First().AttrOr("data-url", "")),
}, nil
}
// Copy the fields of the API user object into the public Profile.
profile := Profile{
	Avatar:         user.ProfileImageURLHTTPS,
	Banner:         user.ProfileBannerURL,
	Biography:      user.Description,
	FollowersCount: user.FollowersCount,
	// The number of accounts this user follows is the friends count; the
	// favourites count is the number of likes and belongs in LikesCount only.
	FollowingCount: user.FriendsCount,
	FriendsCount:   user.FriendsCount,
	IsPrivate:      user.Protected,
	IsVerified:     user.Verified,
	LikesCount:     user.FavouritesCount,
	ListedCount:    user.ListedCount,
	Location:       user.Location,
	Name:           user.Name,
	PinnedTweetIDs: user.PinnedTweetIdsStr,
	TweetsCount:    user.StatusesCount,
	URL:            "https://twitter.com/" + user.ScreenName,
	UserID:         user.IDStr,
	Username:       user.ScreenName,
}

tm, err := time.Parse(time.RubyDate, user.CreatedAt)
if err == nil {
tm = tm.UTC()
profile.Joined = &tm
}

if len(user.Entities.URL.Urls) > 0 {
profile.Website = user.Entities.URL.Urls[0].ExpandedURL
}

func parseCount(str string) (i int) {
i, _ = strconv.Atoi(strings.Replace(str, ",", "", -1))
return
return profile, nil
}
Loading

0 comments on commit edad8f6

Please sign in to comment.