diff --git a/.gitignore b/.gitignore index f7275bb..1ee8ead 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ venv/ +.env diff --git a/README.md b/README.md index 6e361ba..7a7b56b 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,60 @@ ## Twitter scraper Scrape user's tweets :D -### Usage: +## Usage: -`tweets = TweetsScraper().get_tweets_anonymous("")` +### Unauthenticated +Example: +``` +scraper = TweetsScraper() +tweets = scraper.get_tweets_anonymous("") +``` -returns a list of tweets from the user as viewed from a logged-out session. Will only return 100 tweets (not necessarily the most recent) +This will only allow use of the anonymous user tweets method, other methods will fail. -`tweets = TweetsScraper().get_tweets("")` +The anonymous method returns a list of tweets from the user as viewed from a logged-out session. It will only return 100 tweets (not necessarily the most recent) -not implemented yet, will get tweets as a logged in user + +### Authenticated +Example: +``` +dotenv.load_dotenv() + +auth_token = os.environ["AUTH_TOKEN"] +csrf_token = os.environ["CSRF_TOKEN"] + +scraper = TweetsScraper(auth_token, csrf_token) + +user_id = scraper.get_id_from_screen_name("pobnellion") +user_tweets = scraper.get_tweets(user_id, 100) +``` + +Allows you to get tweets as a logged in user. Twitter only makes the 2000 ish most recent tweets available, but that should be more than enough. + +You can either directly pass in the user id to `get_tweets()`, or use `get_id_from_screen_name()` to get the id if you don't have it. 
+ +To use dotenv, include a `.env` file in the directory with the following contents (no quotes around the values): +``` +AUTH_TOKEN= +CSRF_TOKEN= +``` + +You can find your auth and csrf tokens in twitter's cookies (F12 in your browser > storage tab > cookies) +The auth token cookie is called `auth_token` and the csrf token is called `ct0` ### Tweet object Contains the text of the tweet, along with the timestamp and some stats (like count, repost count, views, etc) + +#### Fields: +- id : tweet id +- views : view count +- text : tweet content +- likes : like count +- replies : reply count +- retweets : retweet count +- quotes : quote tweet count +- date : post date + +Printing a tweet object results in an overview: + +`L:52 RT:2 Q:1 R:3 V:1032 2025-01-20T01:53:57+00:00 Example tweet text` diff --git a/requirements.txt b/requirements.txt index fe7bac4..3ddf6e7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ pytz==2024.2 requests==2.32.3 zstd==1.5.6.1 +python-dotenv==1.0.1 diff --git a/scraper.py b/scraper.py index 2eecc7b..0690e41 100644 --- a/scraper.py +++ b/scraper.py @@ -1,132 +1,8 @@ -import requests, re, json, pytz, zstd +import requests, re, json, pytz, zstd, dotenv, os from datetime import datetime -class TweetsScraper: - _GET_TWEETS_URL = 'https://api.x.com/graphql/MpOINUGH_YVb2BKjYZOPaQ/UserTweets' - # public non-logged-in access token (same for everyone, doesn't expire) - _AUTHORIZATION_TOKEN = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' - - _HEADERS = { - "Host": "api.x.com", - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0", - "Accept": "*/*", - "Accept-Language": "en-US,en;q=0.5", - "Accept-Encoding": "gzip, deflate, br, zstd", - "content-type": "application/json", - "authorization": f"Bearer {_AUTHORIZATION_TOKEN}", - "x-twitter-client-language": "en", - "x-twitter-active-user": "yes", - "Origin": "https://x.com", - "Sec-GPC": 
"1", - "Connection": "keep-alive", - "Referer": "https://x.com/", - "Sec-Fetch-Dest": "empty", - "Sec-Fetch-Mode": "cors", - "Sec-Fetch-Site": "same-site", - "TE": "trailers", - } - _FEATURES_USER_TWEETS = '{"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"premium_content_api_read_enabled":false,"communities_web_enable_tweet_community_results_fetch":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"responsive_web_grok_analyze_button_fetch_trends_enabled":false,"responsive_web_grok_analyze_post_followups_enabled":false,"responsive_web_jetfuel_frame":false,"responsive_web_grok_share_attachment_enabled":true,"articles_preview_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_grok_image_annotation_enabled":false,"responsive_web_enhance_cards_enabled":false}' - _FIELD_TOGGLES_USER_TWEETS = '{"withArticlePlainText":false}' - - def __init__(self): - self._session = requests.Session() - - def _get_guest_token(self): - # Get guest token from x.com request - if "x-guest-token" in self._HEADERS.keys(): - return - - # 
Different headers necessary so we dont get a 400 response - headers = { - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Encoding": "gzip, deflate, br, zstd", - "Accept-Language": "en-US,en;q=0.5", - "Cache-Control": "no-cache", - "Connection": "keep-alive", - "Host": "x.com", - "Pragma": "no-cache", - "Priority": "u=0, i", - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Sec-Fetch-User": "?1", - "Sec-GPC": "1", - "Upgrade-Insecure-Requests": "1", - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0", - "x-client-transaction-id": "78k0T6XnCiJK2f5fZ5RwmyeKmiOHk8HFfWovTa6JQF4DRfIkyjpARHxQzi0pWxKPtzks0ezFLICv4xuxmIyokH1EBEWe7A", - "x-guest-token": "1881289785240932544" - } - - res = self._session.get("https://x.com/?mx=2", headers=headers) - - # find the guest token in the response - self._HEADERS["x-guest-token"] = res.text.split("gt=")[1].split(";")[0] - - def get_tweets_anonymous(self, user): - self._get_guest_token() - - variables = { - "userId": user, - "count": 100, - "includePromotedContent": True, - "withQuickPromoteEligibilityTweetFields": True, - "withVoice": True, - "withV2Timeline": True - } - - res = self._session.get(self._GET_TWEETS_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_USER_TWEETS, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS}, headers=self._HEADERS) - - res_json = None - try: - res_json = json.loads(zstd.decompress(res.content)) - except: - res_json = json.loads(res.text) - - entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries'] - return [Tweet(entry) for entry in entries if "tweet" in entry['entryId']] - - def get_tweets(self, user, count=100): - return - # self._get_guest_token() - # user_id = self._get_user_by_screen_name(user) - - - tweets = [] - variables = { - 
"userId": user, - "count": min(count, 100), - "includePromotedContent": True, - "withQuickPromoteEligibilityTweetFields": True, - "withVoice": True, - "withV2Timeline": True - } - last_len = 0 - - while len(tweets) < count: - res = requests.get(self._GET_TWEETS_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_USER_TWEETS, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS}, headers=self._HEADERS) - - res_json = None - try: - res_json = json.loads(zstd.decompress(res.content)) - except: - res_json = json.loads(res.text) - - entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries'] - tweets.extend([Tweet(entry) for entry in entries if "tweet" in entry['entryId']]) - # variables['cursor'] = [entry for entry in entries if "cursor-bottom" in entry['entryId']][0]['content']['value'] - - break - - if len(tweets) == last_len: - break - - print(f"Got {len(tweets)} tweets") - last_len = len(tweets) - - return tweets - -class Tweet(): +class Tweet: def __init__(self, tweet_object): tweet = tweet_object['content']['itemContent']['tweet_results']['result'] self.id = tweet['rest_id'] @@ -148,8 +24,158 @@ class Tweet(): return datetime.now().astimezone(pytz.utc) - self.date +class TweetsScraper: + _GET_TWEETS_URL = 'https://api.x.com/graphql/MpOINUGH_YVb2BKjYZOPaQ/UserTweets' + _GET_ID_BY_SCREEN_NAME_URL = "https://x.com/i/api/graphql/32pL5BWe9WKeSK1MoPvFQQ/UserByScreenName" + + # public non-logged-in access token (same for everyone, doesn't expire) + _PUBLIC_AUTHORIZATION_TOKEN = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA' + + _COMMON_HEADERS = { + "Accept": "*/*", + "Accept-Encoding": "gzip, deflate, br, zstd", + "Accept-Language": "en-US,en;q=0.5", + "authorization": f"Bearer {_PUBLIC_AUTHORIZATION_TOKEN}", + "Cache-Control": "no-cache", + "Connection": "keep-alive", + 
"content-type": "application/json", + "Pragma": "no-cache", + "Sec-GPC": "1", + "TE": "trailers", + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0", + } + + _FEATURES_USER_TWEETS = '{"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"premium_content_api_read_enabled":false,"communities_web_enable_tweet_community_results_fetch":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"responsive_web_grok_analyze_button_fetch_trends_enabled":false,"responsive_web_grok_analyze_post_followups_enabled":false,"responsive_web_jetfuel_frame":false,"responsive_web_grok_share_attachment_enabled":true,"articles_preview_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_grok_image_annotation_enabled":false,"responsive_web_enhance_cards_enabled":false}' + _FIELD_TOGGLES_USER_TWEETS = '{"withArticlePlainText":false}' + _FEATURES_SCREEN_NAME = 
'{"hidden_profile_subscriptions_enabled":true,"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"subscriptions_verification_info_is_identity_verified_enabled":true,"subscriptions_verification_info_verified_since_enabled":true,"highlights_tweets_tab_ui_enabled":true,"responsive_web_twitter_article_notes_tab_enabled":true,"subscriptions_feature_can_gift_premium":true,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline_navigation_enabled":true}' + _FIELD_TOGGLES_SCREEN_NAME = '{"withAuxiliaryUserLabels":false}' + + + def __init__(self, auth_token: str = None, csrf_token: str = None): + self._session = requests.Session() + self._session.headers.update(self._COMMON_HEADERS) + + if auth_token is not None and csrf_token is not None: + self._session.cookies.set("auth_token", auth_token) + self._session.cookies.set("ct0", csrf_token) + self._session.headers.update({ "x-csrf-token": csrf_token }) + + + def _get_guest_token(self): + # Get guest token from x.com request + if "x-guest-token" in self._session.headers.keys(): + return + + # Different headers necessary so we dont get a 400 response + headers = { + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Encoding": "gzip, deflate, br, zstd", + "Accept-Language": "en-US,en;q=0.5", + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "Pragma": "no-cache", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + "Sec-GPC": "1", + "Upgrade-Insecure-Requests": "1", + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0", + } + + res = requests.get("https://x.com/?mx=2", headers=headers) + + # find the guest token in the response + 
self._session.headers.update({ "x-guest-token": res.text.split("gt=")[1].split(";")[0] }) + + + def get_tweets_anonymous(self, user_id: str) -> list[Tweet]: + self._get_guest_token() + + variables = { + "userId": user_id, + "count": 100, + "includePromotedContent": True, + "withQuickPromoteEligibilityTweetFields": True, + "withVoice": True, + "withV2Timeline": True + } + + res = self._session.get(self._GET_TWEETS_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_USER_TWEETS, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS}) + + res_json = None + try: + res_json = json.loads(zstd.decompress(res.content)) + except: + res_json = json.loads(res.text) + + entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries'] + return [Tweet(entry) for entry in entries if "tweet" in entry['entryId']] + + + def get_tweets(self, user_id: str, count: int=100) -> list[Tweet]: + tweets = [] + variables = { + "userId": user_id, + "count": min(count, 100), + "includePromotedContent": True, + "withQuickPromoteEligibilityTweetFields": True, + "withVoice": True, + "withV2Timeline": True + } + last_len = 0 + + while len(tweets) < count: + res = self._session.get(self._GET_TWEETS_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_USER_TWEETS, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS}) + + res_json = None + try: + res_json = json.loads(zstd.decompress(res.content)) + except: + res_json = json.loads(res.text) + + entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries'] + tweets.extend([Tweet(entry) for entry in entries if "tweet" in entry['entryId']]) + + # get cursor for next page + variables['cursor'] = [entry for entry in entries if "cursor-bottom" in entry['entryId']][0]['content']['value'] + + if 
len(tweets) == last_len: + break + + print(f"Got {len(tweets)} tweets") + last_len = len(tweets) + + return tweets + + + def get_id_from_screen_name(self, name: str) -> str: + variables = { + "screen_name": name + } + + res = self._session.get(self._GET_ID_BY_SCREEN_NAME_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_SCREEN_NAME, "fieldToggles": self._FIELD_TOGGLES_SCREEN_NAME}) + + res_json = None + try: + res_json = json.loads(zstd.decompress(res.content)) + except: + res_json = json.loads(res.text) + + return res_json['data']['user']['result']['rest_id'] + + + if __name__ == "__main__": - user_tweets = TweetsScraper().get_tweets_anonymous("1279948441968246785") # pobnellion + dotenv.load_dotenv() + + auth_token = os.environ["AUTH_TOKEN"] + csrf_token = os.environ["CSRF_TOKEN"] + + scraper = TweetsScraper(auth_token, csrf_token) + + user_id = scraper.get_id_from_screen_name("pobnellion") + user_tweets = scraper.get_tweets(user_id, 100) for t in user_tweets: print(t) \ No newline at end of file