diff --git a/README.md b/README.md
index 7a7b56b..59d198c 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ csrf_token = os.environ["CSRF_TOKEN"]
 
 scraper = TweetsScraper(auth_token, csrf_token)
 
-user_id = scraper.get_id_from_screen_name("pobnellion")
+user_id = scraper.get_id_from_handle("pobnellion")
 user_tweets = scraper.get_tweets(user_id, 100)
 ```
 
@@ -39,9 +39,20 @@
 AUTH_TOKEN=
 CSRF_TOKEN=
 ```
+
 You can find your auth and csrf tokens in twitter's cookies (F12 in your browser > storage tab > cookies)
 The auth token cookie is called `auth_token` and the csrf token is called `ct0`
 
+#### Include replies
+```
+user_id = scraper.get_id_from_handle("@pobnellion")
+user_tweets = scraper.get_tweets_and_replies(user_id, 100)
+```
+
+This is equivalent to viewing the 'Replies' tab on Twitter. Replies show up as Conversation objects, which contain a list of tweets.
+The last tweet in the conversation will always be by the currently viewed user, even if there are more replies in the chain.
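+
+For example, a minimal sketch of telling the two result types apart while iterating (this assumes `Conversation` is imported from `scraper` alongside `TweetsScraper`):
+
+```
+for item in user_tweets:
+    if isinstance(item, Conversation):
+        # a reply chain: print every tweet in it
+        for tweet in item.items:
+            print(tweet)
+    else:
+        # a standalone tweet
+        print(item)
+```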
+
+
 ### Tweet object
 
 Contains the text of the tweet, along with the timestamp and some stats (like count, repost count, views, etc)
@@ -54,7 +65,33 @@ Contains the text of the tweet, along with the timestamp and some stats (like co
 - retweets : retweet count
 - quotes : quote tweet count
 - date : post date
+- is_retweet : tweet is a retweet
+- is_quote : tweet is a quote tweet
+- user : user who sent the tweet (useful in conversations)
 
 Printing a tweet object results in an overview:
 
 `L:52 RT:2 Q:1 R:3 V:1032 2025-01-20T01:53:57+00:00 Example tweet text`
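+
+For example, a short sketch that uses `is_retweet`, `is_quote` and `user` to skip reposts and show who wrote the remaining tweets:
+
+```
+tweets = scraper.get_tweets(user_id, 100)
+for tweet in tweets:
+    if tweet.is_retweet or tweet.is_quote:
+        continue
+    print(tweet.user, "-", tweet.text)
+```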
'{"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"premium_content_api_read_enabled":false,"communities_web_enable_tweet_community_results_fetch":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"responsive_web_grok_analyze_button_fetch_trends_enabled":false,"responsive_web_grok_analyze_post_followups_enabled":true,"responsive_web_jetfuel_frame":false,"responsive_web_grok_share_attachment_enabled":true,"articles_preview_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,"responsive_web_grok_analysis_button_from_backend":false,"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_grok_image_annotation_enabled":true,"responsive_web_enhance_cards_enabled":false}' _FIELD_TOGGLES_USER_TWEETS = '{"withArticlePlainText":false}' _FEATURES_SCREEN_NAME = '{"hidden_profile_subscriptions_enabled":true,"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"subscriptions_verification_info_is_identity_verified_enabled":true,"subscriptions_verification_info_verified_since_enabled":true,"highlights_tweets_tab_ui_enabled":true,"responsive_web_twitter_article_notes_tab_enabled":true,"subscriptions_feature_can_gift_premium":true,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline_navigation_enabled":true}' _FIELD_TOGGLES_SCREEN_NAME = '{"withAuxiliaryUserLabels":false}' @@ -112,46 +149,60 @@ class TweetsScraper: entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries'] return [Tweet(entry) for entry in entries if "tweet" in entry['entryId']] + def _get_tweets_internal(self, url: str, features: str, user_id: str, count: int=100) -> list[Tweet]: + tweets = [] + variables = { + "userId": user_id, + "count": min(count, 100), + "includePromotedContent": True, + "withQuickPromoteEligibilityTweetFields": True, + "withVoice": True, + "withV2Timeline": True + } + last_len = 0 + + while len(tweets) < count: + res = self._session.get(url, params={"variables": json.dumps(variables, separators=(',', ':')), "features": features, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS}) + + res_json = None + try: + res_json = json.loads(zstd.decompress(res.content)) + except: + res_json = json.loads(res.text) + + entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries'] + + for entry in entries: + if 
"tweet" in entry['entryId']: + tweets.append(Tweet(entry['content'])) + elif "profile-conversation" in entry['entryId']: + tweets.append(Conversation(entry)) + + # get cursor for next page + variables['cursor'] = [entry for entry in entries if "cursor-bottom" in entry['entryId']][0]['content']['value'] + + if len(tweets) == last_len: + break + + print(f"Got {len(tweets)} tweets") + last_len = len(tweets) + + return tweets + + + def get_tweets_and_replies(self, user_id: str, count: int=100) -> list[Tweet]: + return self._get_tweets_internal('https://x.com/i/api/graphql/zMHMBPv4hZcCNwWLf-XYuA/UserTweetsAndReplies', self._FEATURES_USER_TWEETS_AND_REPLIES, user_id, count) def get_tweets(self, user_id: str, count: int=100) -> list[Tweet]: - tweets = [] - variables = { - "userId": user_id, - "count": min(count, 100), - "includePromotedContent": True, - "withQuickPromoteEligibilityTweetFields": True, - "withVoice": True, - "withV2Timeline": True - } - last_len = 0 - - while len(tweets) < count: - res = self._session.get(self._GET_TWEETS_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_USER_TWEETS, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS}) - - res_json = None - try: - res_json = json.loads(zstd.decompress(res.content)) - except: - res_json = json.loads(res.text) - - entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries'] - tweets.extend([Tweet(entry) for entry in entries if "tweet" in entry['entryId']]) - - # get cursor for next page - variables['cursor'] = [entry for entry in entries if "cursor-bottom" in entry['entryId']][0]['content']['value'] - - if len(tweets) == last_len: - break - - print(f"Got {len(tweets)} tweets") - last_len = len(tweets) - - return tweets - + return self._get_tweets_internal('https://x.com/i/api/graphql/MpOINUGH_YVb2BKjYZOPaQ/UserTweets', self._FEATURES_USER_TWEETS, user_id, count) + - def get_id_from_screen_name(self, name: str) -> str: + def get_id_from_handle(self, handle: str) -> str: + if handle.startswith('@'): + handle = handle[1:] + variables = { - "screen_name": name + "screen_name": handle } res = self._session.get(self._GET_ID_BY_SCREEN_NAME_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_SCREEN_NAME, "fieldToggles": self._FIELD_TOGGLES_SCREEN_NAME}) @@ -174,8 +225,8 @@ if __name__ == "__main__": scraper = TweetsScraper(auth_token, csrf_token) - user_id = scraper.get_id_from_screen_name("pobnellion") - user_tweets = scraper.get_tweets(user_id, 100) + user_id = scraper.get_id_from_handle("@pobnellion") + user_tweets = scraper.get_tweets_and_replies(user_id, 100) for t in user_tweets: print(t) \ No newline at end of file