-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
48 lines (38 loc) · 1.79 KB
/
app.py
File metadata and controls
48 lines (38 loc) · 1.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import tweepy
import csv
import pandas as pd
import snscrape.modules.twitter as sntwitter
from secrets import consumer_key, consumer_secret
# get tweets with tweepy from the last 7 days
def get_tweets_tweepy(keyword, date):
    """Collect tweets matching *keyword* (retweets excluded) and write
    them to '<keyword>_no_rt_scrape.csv'.

    keyword: search query string handed to the Twitter search API.
    date:    upper bound ('until') for tweet creation dates, YYYY-MM-DD.
    """
    # Authenticate; wait_on_rate_limit makes the client sleep through
    # Twitter's rate limiting instead of raising.
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True)

    collected = []
    # Walk the paginated search results via a Cursor.
    for tweet in tweepy.Cursor(api.search, until=date, q=keyword).items():
        # Skip retweets, whether flagged by the API or embedded as 'RT @'.
        if tweet.retweeted or 'RT @' in tweet.text:
            continue
        collected.append((
            tweet.created_at,
            tweet.id,
            tweet.geo,
            tweet.text,
            tweet.user._json['screen_name'],
        ))

    # Persist the scrape as CSV without the index column.
    frame = pd.DataFrame(collected, columns=['datetime', 'tweet_id', 'geo', 'text', 'username'])
    frame.to_csv(keyword + '_no_rt_scrape' + '.csv', index=False)
# get tweets with snscrape
def get_tweets_sn(query='rollerblades since:2020-01-01 until:2020-12-01',
                  max_tweets=100000,
                  filename='2020_rollerbladessnscrape_tweets.csv'):
    """Scrape tweets with snscrape and save them to a CSV file.

    Parameters (all optional; defaults reproduce the original hard-coded run):
        query:      snscrape search string (supports since:/until: filters).
        max_tweets: stop once the enumeration index exceeds this value.
                    NOTE(review): `i > max_tweets` means max_tweets + 1 rows
                    are collected — kept as-is for backward compatibility.
        filename:   destination CSV path.
    """
    snScrapedTweets = []
    # iterate through tweets using TwitterSearchScraper and save them to array
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i > max_tweets:
            break
        snScrapedTweets.append([tweet.date, tweet.id, tweet.content, tweet.username])
    # export tweets to dataframe, then write without the index column
    df = pd.DataFrame(snScrapedTweets, columns=['datetime', 'tweet_id', 'text', 'username'])
    df.to_csv(filename, index=False)
# Script entry point: guarded so importing this module for its functions
# does not kick off a (slow, network-bound) scrape as a side effect.
if __name__ == "__main__":
    # get tweets using Tweepy (disabled; requires API credentials)
    # get_tweets_tweepy("rollerskating", '2020-01-01')
    # get tweets using snscrape
    get_tweets_sn()