Add method for getting tweets with replies

2025-01-30 02:12:54 +13:00
parent a81a461da2
commit 49df46ccf9
2 changed files with 129 additions and 41 deletions

View File

@@ -25,7 +25,7 @@ csrf_token = os.environ["CSRF_TOKEN"]
 scraper = TweetsScraper(auth_token, csrf_token)
-user_id = scraper.get_id_from_screen_name("pobnellion")
+user_id = scraper.get_id_from_handle("pobnellion")
 user_tweets = scraper.get_tweets(user_id, 100)
 ```
@@ -39,9 +39,20 @@ AUTH_TOKEN=<auth token>
 CSRF_TOKEN=<csrf token>
 ```
 You can find your auth and CSRF tokens in Twitter's cookies (F12 in your browser > Storage tab > Cookies).
 The auth token cookie is called `auth_token` and the CSRF token cookie is called `ct0`.
+#### Include replies
+```
+user_id = scraper.get_id_from_handle("@pobnellion")
+user_tweets = scraper.get_tweets_and_replies(user_id, 100)
+```
+This is equivalent to viewing the 'Replies' tab on Twitter; replies show up as Conversation objects, which contain a list of tweets.
+The last tweet in a conversation will always be by the currently viewed user, even if there are more replies in the chain.
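For illustration, a minimal sketch of handling the mixed result list (the `tweets_scraper` import path is an assumption; `Conversation` and `Tweet` are the classes documented below):
```
# Hypothetical import path -- adjust to wherever TweetsScraper and its classes live.
from tweets_scraper import Conversation

items = scraper.get_tweets_and_replies(user_id, 100)
for item in items:
    if isinstance(item, Conversation):
        # A reply thread: a Conversation wrapping a list of tweets.
        print(f"Conversation with {len(item.items)} tweets")
    else:
        # A standalone tweet from the user's timeline.
        print(item)
```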
 ### Tweet object
 Contains the text of the tweet, along with the timestamp and some stats (like count, repost count, views, etc.)
@@ -54,7 +65,33 @@ Contains the text of the tweet, along with the timestamp and some stats (like co
 - retweets : retweet count
 - quotes : quote tweet count
 - date : post date
+- is_retweet : tweet is a retweet
+- is_quote : tweet is a quote tweet
+- user : the user who sent the tweet (useful in conversations)
 Printing a tweet object results in an overview:
 `L:52 RT:2 Q:1 R:3 V:1032 2025-01-20T01:53:57+00:00 Example tweet text`
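For example, a short sketch of working with these fields (using the `user_tweets` list from the usage example at the top, and assuming it contains only Tweet objects):
```
# Skip retweets and quote tweets, then tally likes on the remaining originals.
originals = [t for t in user_tweets if not t.is_retweet and not t.is_quote]
total_likes = sum(t.likes for t in originals)
print(f"{len(originals)} original tweets, {total_likes} likes total")
```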
+### Conversation object
+Container for the list of tweets shown when viewing the Replies tab; it holds no other information.
+#### Fields
+- items : list of tweets in the conversation
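Continuing the sketch from the 'Include replies' section, the note above (the last tweet in a conversation is always by the viewed user) can be used to pull out just that user's replies:
```
# The viewed user's reply is always the final tweet in the conversation.
for item in items:
    if isinstance(item, Conversation):
        own_reply = item.items[-1]
        print(f"Replied: {own_reply.text}")
```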
+### User object
+A Twitter user.
+#### Fields
+- id : user id
+- handle : user handle (without the @)
+- display_name : display name
+- description : profile description
+- join_date : account creation date
+- location : profile location
+- tweets_count : number of tweets posted
+- blue_verified : whether the user is Blue verified
+- follower_count : follower count
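A quick sketch of reading user details off a tweet via its `user` field (reusing `originals` from the Tweet object sketch above):
```
# Every Tweet carries the User who posted it (see fields above).
author = originals[0].user
print(author)                          # prints "Display Name - @handle"
print(author.follower_count, author.join_date)
```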

View File

@@ -4,7 +4,8 @@ from datetime import datetime
 class Tweet:
     def __init__(self, tweet_object):
-        tweet = tweet_object['content']['itemContent']['tweet_results']['result']
+        tweet = tweet_object['itemContent']['tweet_results']['result']
         self.id = tweet['rest_id']
         self.views = tweet['views']['count'] if "count" in tweet["views"].keys() else 0
         self.text = tweet['legacy']['full_text']
@@ -13,6 +14,10 @@ class Tweet:
         self.retweets = tweet['legacy']['retweet_count']
         self.quotes = tweet['legacy']['quote_count']
         self.date = datetime.strptime(tweet['legacy']['created_at'], "%a %b %d %H:%M:%S %z %Y").astimezone(pytz.utc)
+        self.is_retweet = tweet['legacy']['retweeted']
+        self.is_quote = tweet['legacy']['is_quote_status']
+        self.user = User(tweet['core']['user_results'])

     def __repr__(self):
         return f"L:{self.likes} \tRT:{self.retweets} \tQ:{self.quotes} \tR:{self.replies} \tV:{self.views} \t{self.text}"
@@ -24,8 +29,39 @@ class Tweet:
         return datetime.now().astimezone(pytz.utc) - self.date

+class Conversation:
+    def __init__(self, conversation_object):
+        # A conversation entry wraps a list of timeline items, each holding one tweet.
+        self.items = []
+        for item in conversation_object['content']['items']:
+            self.items.append(Tweet(item['item']))
+
+    def __repr__(self):
+        return f"Conversation - {len(self.items)} tweets"
+
+    def __str__(self):
+        return repr(self) + "".join(f"\n\t{tweet}" for tweet in self.items)
+
+class User:
+    def __init__(self, user_object):
+        self.id = user_object['result']['rest_id']
+        self.handle = user_object['result']['legacy']['screen_name']
+        self.display_name = user_object['result']['legacy']['name']
+        self.description = user_object['result']['legacy']['description']
+        self.join_date = user_object['result']['legacy']['created_at']
+        self.location = user_object['result']['legacy']['location']
+        self.tweets_count = user_object['result']['legacy']['statuses_count']
+        self.blue_verified = user_object['result']['is_blue_verified']
+        self.follower_count = user_object['result']['legacy']['normal_followers_count']
+
+    def __str__(self):
+        return f"{self.display_name} - @{self.handle}"
+
+    def __repr__(self):
+        return f"{self.display_name} - @{self.handle} (Id {self.id})"
 class TweetsScraper:
-    _GET_TWEETS_URL = 'https://api.x.com/graphql/MpOINUGH_YVb2BKjYZOPaQ/UserTweets'
     _GET_ID_BY_SCREEN_NAME_URL = "https://x.com/i/api/graphql/32pL5BWe9WKeSK1MoPvFQQ/UserByScreenName"
     # public non-logged-in access token (same for everyone, doesn't expire)
@@ -46,6 +82,7 @@ class TweetsScraper:
     }
     _FEATURES_USER_TWEETS = '{"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"premium_content_api_read_enabled":false,"communities_web_enable_tweet_community_results_fetch":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"responsive_web_grok_analyze_button_fetch_trends_enabled":false,"responsive_web_grok_analyze_post_followups_enabled":false,"responsive_web_jetfuel_frame":false,"responsive_web_grok_share_attachment_enabled":true,"articles_preview_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_grok_image_annotation_enabled":false,"responsive_web_enhance_cards_enabled":false}'
+    _FEATURES_USER_TWEETS_AND_REPLIES = '{"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"premium_content_api_read_enabled":false,"communities_web_enable_tweet_community_results_fetch":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"responsive_web_grok_analyze_button_fetch_trends_enabled":false,"responsive_web_grok_analyze_post_followups_enabled":true,"responsive_web_jetfuel_frame":false,"responsive_web_grok_share_attachment_enabled":true,"articles_preview_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,"responsive_web_grok_analysis_button_from_backend":false,"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_grok_image_annotation_enabled":true,"responsive_web_enhance_cards_enabled":false}'
     _FIELD_TOGGLES_USER_TWEETS = '{"withArticlePlainText":false}'
     _FEATURES_SCREEN_NAME = '{"hidden_profile_subscriptions_enabled":true,"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"subscriptions_verification_info_is_identity_verified_enabled":true,"subscriptions_verification_info_verified_since_enabled":true,"highlights_tweets_tab_ui_enabled":true,"responsive_web_twitter_article_notes_tab_enabled":true,"subscriptions_feature_can_gift_premium":true,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline_navigation_enabled":true}'
     _FIELD_TOGGLES_SCREEN_NAME = '{"withAuxiliaryUserLabels":false}'
@@ -112,46 +149,60 @@ class TweetsScraper:
         entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries']
         return [Tweet(entry) for entry in entries if "tweet" in entry['entryId']]

+    def _get_tweets_internal(self, url: str, features: str, user_id: str, count: int=100) -> list[Tweet | Conversation]:
+        tweets = []
+        variables = {
+            "userId": user_id,
+            "count": min(count, 100),
+            "includePromotedContent": True,
+            "withQuickPromoteEligibilityTweetFields": True,
+            "withVoice": True,
+            "withV2Timeline": True
+        }
+        last_len = 0
+        while len(tweets) < count:
+            res = self._session.get(url, params={"variables": json.dumps(variables, separators=(',', ':')), "features": features, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS})
+            res_json = None
+            try:
+                # the response body may be zstd-compressed; fall back to plain text
+                res_json = json.loads(zstd.decompress(res.content))
+            except:
+                res_json = json.loads(res.text)
+            entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries']
+            for entry in entries:
+                if "tweet" in entry['entryId']:
+                    tweets.append(Tweet(entry['content']))
+                elif "profile-conversation" in entry['entryId']:
+                    tweets.append(Conversation(entry))
+            # get cursor for next page
+            variables['cursor'] = [entry for entry in entries if "cursor-bottom" in entry['entryId']][0]['content']['value']
+            if len(tweets) == last_len:
+                break
+            print(f"Got {len(tweets)} tweets")
+            last_len = len(tweets)
+        return tweets
+
+    def get_tweets_and_replies(self, user_id: str, count: int=100) -> list[Tweet | Conversation]:
+        return self._get_tweets_internal('https://x.com/i/api/graphql/zMHMBPv4hZcCNwWLf-XYuA/UserTweetsAndReplies', self._FEATURES_USER_TWEETS_AND_REPLIES, user_id, count)
     def get_tweets(self, user_id: str, count: int=100) -> list[Tweet]:
-        tweets = []
-        variables = {
-            "userId": user_id,
-            "count": min(count, 100),
-            "includePromotedContent": True,
-            "withQuickPromoteEligibilityTweetFields": True,
-            "withVoice": True,
-            "withV2Timeline": True
-        }
-        last_len = 0
-        while len(tweets) < count:
-            res = self._session.get(self._GET_TWEETS_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_USER_TWEETS, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS})
-            res_json = None
-            try:
-                res_json = json.loads(zstd.decompress(res.content))
-            except:
-                res_json = json.loads(res.text)
-            entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries']
-            tweets.extend([Tweet(entry) for entry in entries if "tweet" in entry['entryId']])
-            # get cursor for next page
-            variables['cursor'] = [entry for entry in entries if "cursor-bottom" in entry['entryId']][0]['content']['value']
-            if len(tweets) == last_len:
-                break
-            print(f"Got {len(tweets)} tweets")
-            last_len = len(tweets)
-        return tweets
-
-    def get_id_from_screen_name(self, name: str) -> str:
-        variables = {
-            "screen_name": name
-        }
+        return self._get_tweets_internal('https://x.com/i/api/graphql/MpOINUGH_YVb2BKjYZOPaQ/UserTweets', self._FEATURES_USER_TWEETS, user_id, count)
+
+    def get_id_from_handle(self, handle: str) -> str:
+        if handle.startswith('@'):
+            handle = handle[1:]
+        variables = {
+            "screen_name": handle
+        }
         res = self._session.get(self._GET_ID_BY_SCREEN_NAME_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_SCREEN_NAME, "fieldToggles": self._FIELD_TOGGLES_SCREEN_NAME})
@@ -174,8 +225,8 @@ if __name__ == "__main__":
     scraper = TweetsScraper(auth_token, csrf_token)
-    user_id = scraper.get_id_from_screen_name("pobnellion")
-    user_tweets = scraper.get_tweets(user_id, 100)
+    user_id = scraper.get_id_from_handle("@pobnellion")
+    user_tweets = scraper.get_tweets_and_replies(user_id, 100)
     for t in user_tweets:
         print(t)