tweets_extractor.py
from pandas import DataFrame
import numpy as np
import tweepy
import os
# Using python-decouple:
# from decouple import config

# Using python-dotenv (this script): load credentials from a .env file
from dotenv import load_dotenv

load_dotenv()
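
# A minimal .env sketch (placeholder values), assuming the file sits in the
# directory the script is run from:
#
#   CONSUMER_KEY=your-consumer-key
#   CONSUMER_SECRET=your-consumer-secret
#   ACCESS_TOKEN=your-access-token
#   ACCESS_TOKEN_SECRET=your-access-token-secret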

# Alternatively, using a local os/env.py module:
# import env

# Authentication and connection to the Twitter API.
# Using python-decouple:
# consumer_key = config("CONSUMER_KEY")
# consumer_secret = config("CONSUMER_SECRET")
# access_token = config("ACCESS_TOKEN")
# access_token_secret = config("ACCESS_TOKEN_SECRET")

# Using the .env file
consumer_key = os.getenv("CONSUMER_KEY")
consumer_secret = os.getenv("CONSUMER_SECRET")
access_token = os.getenv("ACCESS_TOKEN")
access_token_secret = os.getenv("ACCESS_TOKEN_SECRET")

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit makes tweepy sleep and retry when the rate limit is hit
api = tweepy.API(auth, wait_on_rate_limit=True)
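
# Optional sanity check, a minimal sketch: API.verify_credentials() raises an
# error when the tokens are invalid (TweepyException is the tweepy v4 name;
# v3 calls it TweepError).
# try:
#     api.verify_credentials()
# except tweepy.TweepyException as exc:
#     raise SystemExit(f"Invalid Twitter credentials: {exc}")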


def tweets_extractor(usernames: list):
    """Download each user's timeline and write it to <username>_tweets.csv."""
    # The standard v1.1 user_timeline endpoint returns at most 200 tweets per call
    TWEETS_PER_QUERY = 200
    # Index of the oldest tweet in each page returned by the API
    OLDEST = -1
    # Give the user some feedback that the script is running
    print("Tweets Extractor is starting")
    # Extract tweets from the timelines of targeted politicians of the major political parties
    try:
        # Loop through all the users and extract tweets from their timelines
        for username in usernames:
            print("Downloading %s's tweets:" % username)
            # First page of the user's timeline
            tweets = api.user_timeline(
                screen_name=username,
                # 200 is the maximum allowed count
                count=TWEETS_PER_QUERY,
                exclude_replies=True,
                include_rts=False,
                # Necessary to keep full_text;
                # otherwise the text is truncated to 140 characters
                tweet_mode="extended",
            )
            all_tweets = []
            all_tweets.extend(tweets)
            # Skip users whose timeline came back empty
            if not all_tweets:
                print("No tweets found for %s\n" % username)
                continue
            oldest_id = tweets[OLDEST].id
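            # Page backwards through the timeline using max_id pagination:
            # each request asks only for tweets older than the oldest one seen
            # so far (max_id is inclusive, hence oldest_id - 1) until the API
            # returns an empty page.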
            while True:
                tweets = api.user_timeline(
                    screen_name=username,
                    # 200 is the maximum allowed count
                    count=TWEETS_PER_QUERY,
                    max_id=oldest_id - 1,
                    exclude_replies=True,
                    include_rts=False,
                    # Necessary to keep full_text;
                    # otherwise the text is truncated to 140 characters
                    tweet_mode="extended",
                )
                if len(tweets) == 0:
                    break
                oldest_id = tweets[OLDEST].id
                all_tweets.extend(tweets)
                print(
                    "Number of {0}'s tweets downloaded so far: {1}".format(
                        username, len(all_tweets)
                    )
                )
            # Transform the tweepy tweets into a 2D list that will populate the CSV
            outtweets = [
                [
                    tweet.id_str,
                    tweet.created_at,
                    tweet.lang,
                    tweet.is_quote_status,
                    # Raw tweet text
                    tweet.full_text,
                ]
                for tweet in all_tweets
            ]
            # df = DataFrame(outtweets, columns=["ID", "Date Created", "Text"])
            df = DataFrame(
                outtweets,
                columns=["ID", "Date Created", "Lang", "Quote Status", "Text"],
            )
            # Replace empty/whitespace-only strings with NaN, then drop those rows
            df.replace(r"^\s*$", np.nan, inplace=True, regex=True)
            df.dropna(how="any", axis=0, inplace=True)
            df.to_csv("%s_tweets.csv" % username, index=False)
            print(
                "Raw number of {0}'s tweets collected: {1}".format(
                    username, len(all_tweets)
                )
            )
            print(
                "Filtered number of {0}'s tweets written to CSV: {1}\n".format(
                    username, len(df.index)
                )
            )
    except AttributeError as e:
        print(f"Error Details: {str(e)}")


tweets_extractor(usernames=[""])
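# Example invocation (hypothetical handles):
# tweets_extractor(usernames=["PoliticianOne", "PoliticianTwo"])
# would write PoliticianOne_tweets.csv and PoliticianTwo_tweets.csv
# to the current directory.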