Add method for getting tweets with replies
This commit is contained in:
39
README.md
39
README.md
@ -25,7 +25,7 @@ csrf_token = os.environ["CSRF_TOKEN"]
|
|||||||
|
|
||||||
scraper = TweetsScraper(auth_token, csrf_token)
|
scraper = TweetsScraper(auth_token, csrf_token)
|
||||||
|
|
||||||
user_id = scraper.get_id_from_screen_name("pobnellion")
|
user_id = scraper.get_id_from_handle("pobnellion")
|
||||||
user_tweets = scraper.get_tweets(user_id, 100)
|
user_tweets = scraper.get_tweets(user_id, 100)
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -39,9 +39,20 @@ AUTH_TOKEN=<auth token>
|
|||||||
CSRF_TOKEN=<csrf token>
|
CSRF_TOKEN=<csrf token>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
You can find your auth and csrf tokens in twitter's cookies (F12 in your browser > storage tab > cookies)
|
You can find your auth and csrf tokens in twitter's cookies (F12 in your browser > storage tab > cookies)
|
||||||
The auth token cookie is called `auth_token` and the csrf token is called `ct0`
|
The auth token cookie is called `auth_token` and the csrf token is called `ct0`
|
||||||
|
|
||||||
|
#### Include replies
|
||||||
|
```
|
||||||
|
user_id = scraper.get_id_from_handle("@pobnellion")
|
||||||
|
user_tweets = scraper.get_tweets_and_replies(user_id, 100)
|
||||||
|
```
|
||||||
|
|
||||||
|
This is equivalent to viewing the 'Replies' tab on Twitter. Replies show up as Conversation objects, each of which contains a list of tweets.
|
||||||
|
The last tweet in the conversation will always be by the currently viewed user, even if there are more replies in the chain.
|
||||||
|
|
||||||
|
|
||||||
### Tweet object
|
### Tweet object
|
||||||
Contains the text of the tweet, along with the timestamp and some stats (likes, reposts, views, etc.)
|
Contains the text of the tweet, along with the timestamp and some stats (likes, reposts, views, etc.)
|
||||||
|
|
||||||
@ -54,7 +65,33 @@ Contains the text of the tweet, along with the timestamp and some stats (like co
|
|||||||
- retweets : retweet count
|
- retweets : retweet count
|
||||||
- quotes : quote tweet count
|
- quotes : quote tweet count
|
||||||
- date : post date
|
- date : post date
|
||||||
|
- is_retweet: tweet is a retweet
|
||||||
|
- is_quote: tweet is a quote tweet
|
||||||
|
- user: user who sent tweet (this is useful in conversations)
|
||||||
|
|
||||||
Printing a tweet object results in an overview:
|
Printing a tweet object results in an overview:
|
||||||
|
|
||||||
`L:52 RT:2 Q:1 R:3 V:1032 2025-01-20T01:53:57+00:00 Example tweet text`
|
`L:52 RT:2 Q:1 R:3 V:1032 2025-01-20T01:53:57+00:00 Example tweet text`
|
||||||
|
|
||||||
|
### Conversation object
|
||||||
|
|
||||||
|
Container for a list of tweets as shown when viewing the replies tab. It does not hold any other information.
|
||||||
|
|
||||||
|
#### Fields
|
||||||
|
- items : list of tweets in the conversation
|
||||||
|
|
||||||
|
### User object
|
||||||
|
|
||||||
|
Twitter user
|
||||||
|
|
||||||
|
#### Fields
|
||||||
|
|
||||||
|
- id : user id
|
||||||
|
- handle : user handle (without @)
|
||||||
|
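- display_name : display name (`name` in the API response)
|
||||||
|
- description : profile description
|
||||||
|
- join_date : account creation date (`created_at`, kept as the raw string from the API)
|
||||||
|
- location : profile location
|
||||||
|
- tweets_count : number of tweets (`statuses_count`)
|
||||||
|
- blue_verified : whether the account is blue verified
|
||||||
|
- follower_count : follower count (`normal_followers_count`)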
|
||||||
71
scraper.py
71
scraper.py
@ -4,7 +4,8 @@ from datetime import datetime
|
|||||||
|
|
||||||
class Tweet:
|
class Tweet:
|
||||||
def __init__(self, tweet_object):
|
def __init__(self, tweet_object):
|
||||||
tweet = tweet_object['content']['itemContent']['tweet_results']['result']
|
tweet = tweet_object['itemContent']['tweet_results']['result']
|
||||||
|
|
||||||
self.id = tweet['rest_id']
|
self.id = tweet['rest_id']
|
||||||
self.views = tweet['views']['count'] if "count" in tweet["views"].keys() else 0
|
self.views = tweet['views']['count'] if "count" in tweet["views"].keys() else 0
|
||||||
self.text = tweet['legacy']['full_text']
|
self.text = tweet['legacy']['full_text']
|
||||||
@ -13,6 +14,10 @@ class Tweet:
|
|||||||
self.retweets = tweet['legacy']['retweet_count']
|
self.retweets = tweet['legacy']['retweet_count']
|
||||||
self.quotes = tweet['legacy']['quote_count']
|
self.quotes = tweet['legacy']['quote_count']
|
||||||
self.date = datetime.strptime(tweet['legacy']['created_at'], "%a %b %d %H:%M:%S %z %Y").astimezone(pytz.utc)
|
self.date = datetime.strptime(tweet['legacy']['created_at'], "%a %b %d %H:%M:%S %z %Y").astimezone(pytz.utc)
|
||||||
|
self.is_retweet = tweet['legacy']['retweeted']
|
||||||
|
self.is_quote = tweet['legacy']['is_quote_status']
|
||||||
|
|
||||||
|
self.user = User(tweet['core']['user_results'])
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return f"L:{self.likes} \tRT:{self.replies} \tQ:{self.quotes} \tR:{self.replies} \tV:{self.views} \t{self.text}"
|
return f"L:{self.likes} \tRT:{self.replies} \tQ:{self.quotes} \tR:{self.replies} \tV:{self.views} \t{self.text}"
|
||||||
@ -24,8 +29,39 @@ class Tweet:
|
|||||||
return datetime.now().astimezone(pytz.utc) - self.date
|
return datetime.now().astimezone(pytz.utc) - self.date
|
||||||
|
|
||||||
|
|
||||||
|
class Conversation:
    """A reply thread taken from a timeline entry.

    Attributes:
        items: the tweets of the conversation, in the order they appear in
            the entry.
    """

    def __init__(self, converation_object):
        """Build the conversation from a timeline entry.

        Args:
            converation_object: timeline entry dict; every element of
                ``['content']['items']`` carries one tweet under its
                ``'item'`` key. (The parameter keeps its original spelling
                so existing keyword callers keep working.)
        """
        self.items = [Tweet(item['item']) for item in converation_object['content']['items']]

    def __repr__(self):
        """Return a short summary with the number of tweets."""
        return f"Conversation - {len(self.items)} tweets"

    def __str__(self):
        """Return the summary line followed by one indented line per tweet."""
        # Join outside the f-string: nesting a same-quote f-string (with
        # backslashes) inside a replacement field is a SyntaxError before
        # Python 3.12.
        tweet_lines = ''.join("\n\t" + str(tweet) for tweet in self.items)
        return f"{repr(self)}{tweet_lines}"
|
||||||
|
|
||||||
|
|
||||||
|
class User:
    """Twitter user built from a ``user_results`` style dict.

    The constructor reads ``rest_id`` and ``is_blue_verified`` from
    ``user_object['result']`` and every other field from
    ``user_object['result']['legacy']``.
    """

    def __init__(self, user_object):
        result = user_object['result']
        self.id = result['rest_id']

        legacy = result['legacy']
        self.handle = legacy['screen_name']
        self.display_name = legacy['name']
        self.description = legacy['description']
        # Stored exactly as received; no date parsing happens here.
        self.join_date = legacy['created_at']
        self.location = legacy['location']
        self.tweets_count = legacy['statuses_count']
        self.blue_verified = result['is_blue_verified']
        self.follower_count = legacy['normal_followers_count']

    def __str__(self):
        """Return ``<display name> - @<handle>``."""
        return "{} - @{}".format(self.display_name, self.handle)

    def __repr__(self):
        """Return the ``__str__`` text followed by the user id."""
        return "{} - @{} (Id {})".format(self.display_name, self.handle, self.id)
|
||||||
|
|
||||||
class TweetsScraper:
|
class TweetsScraper:
|
||||||
_GET_TWEETS_URL = 'https://api.x.com/graphql/MpOINUGH_YVb2BKjYZOPaQ/UserTweets'
|
|
||||||
_GET_ID_BY_SCREEN_NAME_URL = "https://x.com/i/api/graphql/32pL5BWe9WKeSK1MoPvFQQ/UserByScreenName"
|
_GET_ID_BY_SCREEN_NAME_URL = "https://x.com/i/api/graphql/32pL5BWe9WKeSK1MoPvFQQ/UserByScreenName"
|
||||||
|
|
||||||
# public non-logged-in access token (same for everyone, doesn't expire)
|
# public non-logged-in access token (same for everyone, doesn't expire)
|
||||||
@ -46,6 +82,7 @@ class TweetsScraper:
|
|||||||
}
|
}
|
||||||
|
|
||||||
_FEATURES_USER_TWEETS = '{"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"premium_content_api_read_enabled":false,"communities_web_enable_tweet_community_results_fetch":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"responsive_web_grok_analyze_button_fetch_trends_enabled":false,"responsive_web_grok_analyze_post_followups_enabled":false,"responsive_web_jetfuel_frame":false,"responsive_web_grok_share_attachment_enabled":true,"articles_preview_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_grok_image_annotation_enabled":false,"responsive_web_enhance_cards_enabled":false}'
|
_FEATURES_USER_TWEETS = '{"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"premium_content_api_read_enabled":false,"communities_web_enable_tweet_community_results_fetch":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"responsive_web_grok_analyze_button_fetch_trends_enabled":false,"responsive_web_grok_analyze_post_followups_enabled":false,"responsive_web_jetfuel_frame":false,"responsive_web_grok_share_attachment_enabled":true,"articles_preview_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_grok_image_annotation_enabled":false,"responsive_web_enhance_cards_enabled":false}'
|
||||||
|
_FEATURES_USER_TWEETS_AND_REPLIES = '{"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_timeline_navigation_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"premium_content_api_read_enabled":false,"communities_web_enable_tweet_community_results_fetch":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"responsive_web_grok_analyze_button_fetch_trends_enabled":false,"responsive_web_grok_analyze_post_followups_enabled":true,"responsive_web_jetfuel_frame":false,"responsive_web_grok_share_attachment_enabled":true,"articles_preview_enabled":true,"responsive_web_edit_tweet_api_enabled":true,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":true,"view_counts_everywhere_api_enabled":true,"longform_notetweets_consumption_enabled":true,"responsive_web_twitter_article_tweet_consumption_enabled":true,"tweet_awards_web_tipping_enabled":false,"responsive_web_grok_analysis_button_from_backend":false,"creator_subscriptions_quote_tweet_preview_enabled":false,"freedom_of_speech_not_reach_fetch_enabled":true,"standardized_nudges_misinfo":true,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":true,"rweb_video_timestamps_enabled":true,"longform_notetweets_rich_text_read_enabled":true,"longform_notetweets_inline_media_enabled":true,"responsive_web_grok_image_annotation_enabled":true,"responsive_web_enhance_cards_enabled":false}'
|
||||||
_FIELD_TOGGLES_USER_TWEETS = '{"withArticlePlainText":false}'
|
_FIELD_TOGGLES_USER_TWEETS = '{"withArticlePlainText":false}'
|
||||||
_FEATURES_SCREEN_NAME = '{"hidden_profile_subscriptions_enabled":true,"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"subscriptions_verification_info_is_identity_verified_enabled":true,"subscriptions_verification_info_verified_since_enabled":true,"highlights_tweets_tab_ui_enabled":true,"responsive_web_twitter_article_notes_tab_enabled":true,"subscriptions_feature_can_gift_premium":true,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline_navigation_enabled":true}'
|
_FEATURES_SCREEN_NAME = '{"hidden_profile_subscriptions_enabled":true,"profile_label_improvements_pcf_label_in_post_enabled":true,"rweb_tipjar_consumption_enabled":true,"responsive_web_graphql_exclude_directive_enabled":true,"verified_phone_label_enabled":false,"subscriptions_verification_info_is_identity_verified_enabled":true,"subscriptions_verification_info_verified_since_enabled":true,"highlights_tweets_tab_ui_enabled":true,"responsive_web_twitter_article_notes_tab_enabled":true,"subscriptions_feature_can_gift_premium":true,"creator_subscriptions_tweet_preview_api_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline_navigation_enabled":true}'
|
||||||
_FIELD_TOGGLES_SCREEN_NAME = '{"withAuxiliaryUserLabels":false}'
|
_FIELD_TOGGLES_SCREEN_NAME = '{"withAuxiliaryUserLabels":false}'
|
||||||
@ -112,8 +149,7 @@ class TweetsScraper:
|
|||||||
entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries']
|
entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries']
|
||||||
return [Tweet(entry) for entry in entries if "tweet" in entry['entryId']]
|
return [Tweet(entry) for entry in entries if "tweet" in entry['entryId']]
|
||||||
|
|
||||||
|
def _get_tweets_internal(self, url: str, features: str, user_id: str, count: int=100) -> list[Tweet]:
|
||||||
def get_tweets(self, user_id: str, count: int=100) -> list[Tweet]:
|
|
||||||
tweets = []
|
tweets = []
|
||||||
variables = {
|
variables = {
|
||||||
"userId": user_id,
|
"userId": user_id,
|
||||||
@ -126,7 +162,7 @@ class TweetsScraper:
|
|||||||
last_len = 0
|
last_len = 0
|
||||||
|
|
||||||
while len(tweets) < count:
|
while len(tweets) < count:
|
||||||
res = self._session.get(self._GET_TWEETS_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_USER_TWEETS, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS})
|
res = self._session.get(url, params={"variables": json.dumps(variables, separators=(',', ':')), "features": features, "fieldToggles": self._FIELD_TOGGLES_USER_TWEETS})
|
||||||
|
|
||||||
res_json = None
|
res_json = None
|
||||||
try:
|
try:
|
||||||
@ -135,7 +171,12 @@ class TweetsScraper:
|
|||||||
res_json = json.loads(res.text)
|
res_json = json.loads(res.text)
|
||||||
|
|
||||||
entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries']
|
entries = [i for i in res_json['data']['user']['result']['timeline_v2']['timeline']['instructions'] if i['type'] == "TimelineAddEntries"][0]['entries']
|
||||||
tweets.extend([Tweet(entry) for entry in entries if "tweet" in entry['entryId']])
|
|
||||||
|
for entry in entries:
|
||||||
|
if "tweet" in entry['entryId']:
|
||||||
|
tweets.append(Tweet(entry['content']))
|
||||||
|
elif "profile-conversation" in entry['entryId']:
|
||||||
|
tweets.append(Conversation(entry))
|
||||||
|
|
||||||
# get cursor for next page
|
# get cursor for next page
|
||||||
variables['cursor'] = [entry for entry in entries if "cursor-bottom" in entry['entryId']][0]['content']['value']
|
variables['cursor'] = [entry for entry in entries if "cursor-bottom" in entry['entryId']][0]['content']['value']
|
||||||
@ -149,9 +190,19 @@ class TweetsScraper:
|
|||||||
return tweets
|
return tweets
|
||||||
|
|
||||||
|
|
||||||
def get_id_from_screen_name(self, name: str) -> str:
|
def get_tweets_and_replies(self, user_id: str, count: int=100) -> "list[Tweet | Conversation]":
    """Fetch a user's tweets and replies.

    Delegates to ``_get_tweets_internal`` with the UserTweetsAndReplies
    endpoint and its feature flags.

    Args:
        user_id: id of the user whose timeline is fetched.
        count: number of items to collect, passed through to
            ``_get_tweets_internal``.

    Returns:
        A list that mixes ``Tweet`` objects with ``Conversation`` objects
        (reply threads), which is why the annotation is not plain
        ``list[Tweet]``.
    """
    return self._get_tweets_internal('https://x.com/i/api/graphql/zMHMBPv4hZcCNwWLf-XYuA/UserTweetsAndReplies', self._FEATURES_USER_TWEETS_AND_REPLIES, user_id, count)
|
||||||
|
|
||||||
|
def get_tweets(self, user_id: str, count: int=100) -> list[Tweet]:
    """Fetch a user's tweets via the UserTweets endpoint.

    Thin wrapper: forwards ``user_id`` and ``count`` to
    ``_get_tweets_internal`` together with the endpoint URL and the
    UserTweets feature flags, and returns its result unchanged.
    """
    endpoint = 'https://x.com/i/api/graphql/MpOINUGH_YVb2BKjYZOPaQ/UserTweets'
    return self._get_tweets_internal(endpoint, self._FEATURES_USER_TWEETS, user_id, count)
|
||||||
|
|
||||||
|
|
||||||
|
def get_id_from_handle(self, handle: str) -> str:
|
||||||
|
if handle.startswith('@'):
|
||||||
|
handle = handle[1:]
|
||||||
|
|
||||||
variables = {
|
variables = {
|
||||||
"screen_name": name
|
"screen_name": handle
|
||||||
}
|
}
|
||||||
|
|
||||||
res = self._session.get(self._GET_ID_BY_SCREEN_NAME_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_SCREEN_NAME, "fieldToggles": self._FIELD_TOGGLES_SCREEN_NAME})
|
res = self._session.get(self._GET_ID_BY_SCREEN_NAME_URL, params={"variables": json.dumps(variables, separators=(',', ':')), "features": self._FEATURES_SCREEN_NAME, "fieldToggles": self._FIELD_TOGGLES_SCREEN_NAME})
|
||||||
@ -174,8 +225,8 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
scraper = TweetsScraper(auth_token, csrf_token)
|
scraper = TweetsScraper(auth_token, csrf_token)
|
||||||
|
|
||||||
user_id = scraper.get_id_from_screen_name("pobnellion")
|
user_id = scraper.get_id_from_handle("@pobnellion")
|
||||||
user_tweets = scraper.get_tweets(user_id, 100)
|
user_tweets = scraper.get_tweets_and_replies(user_id, 100)
|
||||||
|
|
||||||
for t in user_tweets:
|
for t in user_tweets:
|
||||||
print(t)
|
print(t)
|
||||||
Reference in New Issue
Block a user