Get authenticated scraping working

This commit is contained in:
2025-01-26 00:19:53 +13:00
parent 278d0a56c4
commit a81a461da2
4 changed files with 205 additions and 132 deletions

.gitignore (vendored)

@@ -1 +1,2 @@
venv/
.env

README.md

@@ -1,15 +1,60 @@
## Twitter scraper
Scrape user's tweets :D
### Usage:
## Usage:
`tweets = TweetsScraper().get_tweets_anonymous("<user_id>")`
### Unauthenticated
Example:
```
scraper = TweetsScraper()
tweets = scraper.get_tweets_anonymous("<user_id>")
```
returns a list of tweets from the user as viewed from a logged-out session. Will only return 100 tweets (not necessarily the most recent)
`tweets = TweetsScraper().get_tweets("<user_id>")`
not implemented yet, will get tweets as a logged in user
Constructing the scraper without tokens only allows the anonymous tweets method; the other methods will fail.
The anonymous method returns a list of tweets from the user as viewed from a logged-out session. It will only return 100 tweets (not necessarily the most recent).
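The anonymous results are not guaranteed to be in date order, so if you want the newest of those 100 tweets you can sort on the `date` field yourself. A minimal sketch (the `<user_id>` placeholder is illustrative, and it assumes `TweetsScraper` has already been imported from this module):
```
# Sort the anonymous results newest-first using the Tweet.date field.
scraper = TweetsScraper()
tweets = scraper.get_tweets_anonymous("<user_id>")
newest_first = sorted(tweets, key=lambda t: t.date, reverse=True)
for t in newest_first[:10]:
    print(t)
```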
### Authenticated
Example:
```
dotenv.load_dotenv()
auth_token = os.environ["AUTH_TOKEN"]
csrf_token = os.environ["CSRF_TOKEN"]
scraper = TweetsScraper(auth_token, csrf_token)
user_id = scraper.get_id_from_screen_name("pobnellion")
user_tweets = scraper.get_tweets(user_id, 100)
```
Allows you to get tweets as a logged-in user. Twitter only makes roughly the 2000 most recent tweets available, but that should be more than enough.
You can either pass the user id directly to `get_tweets()`, or use `get_id_from_screen_name()` to look it up if you don't have it.
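If you already have the numeric user id, no lookup is needed; here is a minimal sketch under the same assumptions as the example above (the `<user_id>` value is a placeholder and the count of 200 is arbitrary):
```
dotenv.load_dotenv()  # tokens read from a .env file, as described below
scraper = TweetsScraper(os.environ["AUTH_TOKEN"], os.environ["CSRF_TOKEN"])
# Pass the id straight to get_tweets() instead of resolving a screen name first.
user_tweets = scraper.get_tweets("<user_id>", 200)
```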
To use dotenv, include a `.env` file in the project directory with the following contents (no quotes around the values):
```
AUTH_TOKEN=<auth token>
CSRF_TOKEN=<csrf token>
```
You can find your auth and CSRF tokens in Twitter's cookies (F12 in your browser > Storage tab > Cookies).
The auth token cookie is called `auth_token` and the CSRF token cookie is called `ct0`.
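dotenv is optional: the same cookie values can be passed straight to the constructor. A quick sketch with placeholder literals (don't commit real tokens to source control):
```
# Placeholders for the auth_token and ct0 cookie values copied from the browser.
auth_token = "<auth_token cookie value>"
csrf_token = "<ct0 cookie value>"
scraper = TweetsScraper(auth_token, csrf_token)
```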
### Tweet object
Contains the text of the tweet, along with the timestamp and some stats (like count, repost count, views, etc)
#### Fields:
- id : tweet id
- views : view count
- text : tweet content
- likes : like count
- replies : reply count
- retweets : retweet count
- quotes : quote tweet count
- date : post date
Printing a tweet object results in an overview:
`L:52 RT:2 Q:1 R:3 V:1032 2025-01-20T01:53:57+00:00 Example tweet text`
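The fields can also be used directly; for example, a small sketch that ranks a batch of tweets by like count (it assumes `tweets` is a list returned by one of the methods above and that the count fields are integers):
```
# Rank tweets by likes and print a short summary of the top five.
top = sorted(tweets, key=lambda t: t.likes, reverse=True)[:5]
for t in top:
    print(t.date, t.likes, t.text[:80])
```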

requirements.txt

@@ -1,3 +1,4 @@
pytz==2024.2
requests==2.32.3
zstd==1.5.6.1
python-dotenv==1.0.1

Tweets scraper (Python source)

@@ -1,132 +1,8 @@
import requests, re, json, pytz, zstd
import requests, re, json, pytz, zstd, dotenv, os
from datetime import datetime
class TweetsScraper:
_GET_TWEETS_URL = 'https://api.x.com/graphql/MpOINUGH_YVb2BKjYZOPaQ/UserTweets'
# public non-logged-in access token (same for everyone, doesn't expire)
class Tweet:
_AUTHORIZATION_TOKEN = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
_HEADERS = {
"Host": "api.x.com",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0",
"Accept": "*/*",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br, zstd",
"content-type": "application/json",
"authorization": f"Bearer {_AUTHORIZATION_TOKEN}",
"x-twitter-client-language": "en",
"x-twitter-active-user": "yes",
"Origin": "https://x.com",
"Sec-GPC": "1",
"Connection": "keep-alive",
"Referer": "https://x.com/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-site",
"TE": "trailers",
}
_FEATURES_USER_TWEETS = '{"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"premium_content_api_read_enabled":false,"communities_web_enable_tweet_community_results_fetch":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"responsive_web_grok_analyze_button_fetch_trends_enabled":false,"responsive_web_grok_analyze_post_followups_enabled":false,"responsive_web_jetfuel_frame":false,"responsive_web_grok_share_attachment_enabled":true,"articles_preview_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_grok_image_annotation_enabled":false,"responsive_web_enhance_cards_enabled":false}'
_FIELD_TOGGLES_USER_TWEETS = '{"withArticlePlainText":false}'
def __init__(self):
self._session = requests.Session()
def _get_guest_token(self):
# Get guest token from x.com request
if "x-guest-token" in self._HEADERS.keys():
return
# Different headers necessary so we dont get a 400 response
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "en-US,en;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Host": "x.com",
"Pragma": "no-cache",
"Priority": "u=0, i",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Sec-GPC": "1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
"x-client-transaction-id": "78k0T6XnCiJK2f5fZ5RwmyeKmiOHk8HFfWovTa6JQF4DRfIkyjpARHxQzi0pWxKPtzks0ezFLICv4xuxmIyokH1EBEWe7A",
"x-guest-token": "1881289785240932544"
}
res = self._session.get("https://x.com/?mx=2", headers=headers)
# find the guest token in the response
self._HEADERS["x-guest-token"] = res.text.split("gt=")[1].split(";")[0]
def get_tweets_anonymous(self, user):
self._get_guest_token()
variables = {
"userId": user,
"count": 100,
"includePromotedContent": True,
"withQuickPromoteEligibilityTweetFields": True,
"withVoice": True,
"withV2Timeline": True
}
res = self._session.get(self._GET_TWEETS_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_USER_TWEETS, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS}, headers=self._HEADERS)
res_json = None
try:
res_json = json.loads(zstd.decompress(res.content))
except:
res_json = json.loads(res.text)
entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries']
return [Tweet(entry) for entry in entries if "tweet" in entry['entryId']]
def get_tweets(self, user, count=100):
return
# self._get_guest_token()
# user_id = self._get_user_by_screen_name(user)
tweets = []
variables = {
"userId": user,
"count": min(count, 100),
"includePromotedContent": True,
"withQuickPromoteEligibilityTweetFields": True,
"withVoice": True,
"withV2Timeline": True
}
last_len = 0
while len(tweets) < count:
res = requests.get(self._GET_TWEETS_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_USER_TWEETS, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS}, headers=self._HEADERS)
res_json = None
try:
res_json = json.loads(zstd.decompress(res.content))
except:
res_json = json.loads(res.text)
entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries']
tweets.extend([Tweet(entry) for entry in entries if "tweet" in entry['entryId']])
# variables['cursor'] = [entry for entry in entries if "cursor-bottom" in entry['entryId']][0]['content']['value']
break
if len(tweets) == last_len:
break
print(f"Got {len(tweets)} tweets")
last_len = len(tweets)
return tweets
class Tweet():
def __init__(self, tweet_object):
tweet = tweet_object['content']['itemContent']['tweet_results']['result']
self.id = tweet['rest_id']
@@ -148,8 +24,158 @@ class Tweet():
return datetime.now().astimezone(pytz.utc) - self.date
class TweetsScraper:
_GET_TWEETS_URL = 'https://api.x.com/graphql/MpOINUGH_YVb2BKjYZOPaQ/UserTweets'
_GET_ID_BY_SCREEN_NAME_URL = "https://x.com/i/api/graphql/32pL5BWe9WKeSK1MoPvFQQ/UserByScreenName"
# public non-logged-in access token (same for everyone, doesn't expire)
_PUBLIC_AUTHORIZATION_TOKEN = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
_COMMON_HEADERS = {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "en-US,en;q=0.5",
"authorization": f"Bearer {_PUBLIC_AUTHORIZATION_TOKEN}",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"content-type": "application/json",
"Pragma": "no-cache",
"Sec-GPC": "1",
"TE": "trailers",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
}
_FEATURES_USER_TWEETS = '{"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"premium_content_api_read_enabled":false,"communities_web_enable_tweet_community_results_fetch":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"responsive_web_grok_analyze_button_fetch_trends_enabled":false,"responsive_web_grok_analyze_post_followups_enabled":false,"responsive_web_jetfuel_frame":false,"responsive_web_grok_share_attachment_enabled":true,"articles_preview_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_grok_image_annotation_enabled":false,"responsive_web_enhance_cards_enabled":false}'
_FIELD_TOGGLES_USER_TWEETS = '{"withArticlePlainText":false}'
_FEATURES_SCREEN_NAME = '{"hidden_profile_subscriptions_enabled":true,"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"subscriptions_verification_info_is_identity_verified_enabled":true,"subscriptions_verification_info_verified_since_enabled":true,"highlights_tweets_tab_ui_enabled":true,"responsive_web_twitter_article_notes_tab_enabled":true,"subscriptions_feature_can_gift_premium":true,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline_navigation_enabled":true}'
_FIELD_TOGGLES_SCREEN_NAME = '{"withAuxiliaryUserLabels":false}'
def __init__(self, auth_token: str = None, csrf_token: str = None):
self._session = requests.Session()
self._session.headers.update(self._COMMON_HEADERS)
if auth_token is not None and csrf_token is not None:
self._session.cookies.set("auth_token", auth_token)
self._session.cookies.set("ct0", csrf_token)
self._session.headers.update({ "x-csrf-token": csrf_token })
def _get_guest_token(self):
# Get guest token from x.com request
if "x-guest-token" in self._session.headers.keys():
return
# Different headers necessary so we dont get a 400 response
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "en-US,en;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Pragma": "no-cache",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Sec-GPC": "1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
}
res = requests.get("https://x.com/?mx=2", headers=headers)
# find the guest token in the response
self._session.headers.update({ "x-guest-token": res.text.split("gt=")[1].split(";")[0] })
def get_tweets_anonymous(self, user_id: str) -> list[Tweet]:
self._get_guest_token()
variables = {
"userId": user_id,
"count": 100,
"includePromotedContent": True,
"withQuickPromoteEligibilityTweetFields": True,
"withVoice": True,
"withV2Timeline": True
}
res = self._session.get(self._GET_TWEETS_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_USER_TWEETS, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS})
res_json = None
try:
res_json = json.loads(zstd.decompress(res.content))
except:
res_json = json.loads(res.text)
entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries']
return [Tweet(entry) for entry in entries if "tweet" in entry['entryId']]
def get_tweets(self, user_id: str, count: int=100) -> list[Tweet]:
tweets = []
variables = {
"userId": user_id,
"count": min(count, 100),
"includePromotedContent": True,
"withQuickPromoteEligibilityTweetFields": True,
"withVoice": True,
"withV2Timeline": True
}
last_len = 0
while len(tweets) < count:
res = self._session.get(self._GET_TWEETS_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_USER_TWEETS, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS})
res_json = None
try:
res_json = json.loads(zstd.decompress(res.content))
except:
res_json = json.loads(res.text)
entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries']
tweets.extend([Tweet(entry) for entry in entries if "tweet" in entry['entryId']])
# get cursor for next page
variables['cursor'] = [entry for entry in entries if "cursor-bottom" in entry['entryId']][0]['content']['value']
if len(tweets) == last_len:
break
print(f"Got {len(tweets)} tweets")
last_len = len(tweets)
return tweets
def get_id_from_screen_name(self, name: str) -> str:
variables = {
"screen_name": name
}
res = self._session.get(self._GET_ID_BY_SCREEN_NAME_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_SCREEN_NAME, "fieldToggles": self._FIELD_TOGGLES_SCREEN_NAME})
res_json = None
try:
res_json = json.loads(zstd.decompress(res.content))
except:
res_json = json.loads(res.text)
return res_json['data']['user']['result']['rest_id']
if __name__ == "__main__":
user_tweets = TweetsScraper().get_tweets_anonymous("1279948441968246785") # pobnellion
dotenv.load_dotenv()
auth_token = os.environ["AUTH_TOKEN"]
csrf_token = os.environ["CSRF_TOKEN"]
scraper = TweetsScraper(auth_token, csrf_token)
user_id = scraper.get_id_from_screen_name("pobnellion")
user_tweets = scraper.get_tweets(user_id, 100)
for t in user_tweets:
print(t)