from requests_oauthlib import OAuth1Session
import numpy as np
import json
import time
import datetime as dt
import pytz
import sys
import os
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import urllib.request
class SearchTweet(object):
    def __init__(self):
        """
        Create the OAuth1 session used for every API request and
        initialise the bookkeeping attributes.

        Credentials come from the environment variables
        TWITTER_CONSUMER_KEY / TWITTER_CONSUMER_SECRET /
        TWITTER_ACCESS_TOKEN / TWITTER_ACCESS_TOKEN_SECRET when set,
        falling back to the editable placeholders below.  Keeping real
        keys out of the source avoids committing secrets by accident.
        """
        CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY", "--Input your CONSUMER_KEY--")
        CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET", "--Input your CONSUMER_SECRET--")
        ACCESS_TOKEN = os.environ.get("TWITTER_ACCESS_TOKEN", "--Input your ACCESS_TOKEN--")
        ACCESS_TOKEN_SECRET = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "--Input your TOKEN_SECRET--")
        self.twitter = OAuth1Session(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        self.url = "https://api.twitter.com/1.1/search/tweets.json?tweet_mode=extended"
        self.sum = 0                              # running tweet count shown as progress
        self.time = {"hour": None, "min": None}   # timestamp of the last processed tweet
        self.path = ""                            # root folder used by Storedata()
def Setparams(self,target,date,timezone):
"""
Setting parameters for twet search
target(str):Target for the twitter search
date(str):date for search, this program performs search for 1 day(yyyy-mm-dd)
timezone(str):time zone for search, JST, EST, PST, CST are supported (summertime is not included)
In default, 100 tweets are obtained in one request
If you want to change, change "count" parameter in self.params below.
"""
searchstr=target
searchstr+=" since:"+date+"_00:00:00_"+timezone
searchstr+=" until:"+date+"_23:59:59_"+timezone
self.params = {'q':searchstr, 'count':100}
if timezone=="JST":
self.timezone=pytz.timezone('Asia/Tokyo')
elif timezone=="EST":
self.timezone=pytz.timezone('America/New_York')
elif timezone=="PST":
self.timezone=pytz.timezone("America/Los_Angeles")
elif timezone=="CST":
self.timezone=pytz.timezone("America/Winnipeg")
def CheckStoredData(self,overwrite=True):
"""
Check data stored in folder for same search setting (target and date)
if there is folder for storing data, it will be created here.
overwrite : If True, the data searched before in same setting is deleted and search it again.
If False, StoredData.json in folder is checked
If Tweet search was finished, Tweet search is quited.
If tweet search was not finished and stopped, search will be resumed
"""
searchstr=self.params["q"]
target=searchstr[0:searchstr.find("since")-1]
date=searchstr[searchstr.find("since")+6:searchstr.find("since")+16]
print(target,date)
newtag="csvfiles/"+target
if not os.path.exists(newtag):
os.mkdir(newtag)
print("Create",newtag)
newdate="csvfiles/"+target+"/"+date
if not os.path.exists(newdate):
os.mkdir(newdate)
print("Create",newdate)
path="csvfiles/"+target+"/"+date+"/"
fileList=os.listdir(path)
if overwrite:
self.mid=-1
print("clear all data in",path)
for file in fileList:
os.remove(path+file)
else:
if "StoredData.json" in fileList:
file = open(path+"StoredData.json", 'r')
pfile=json.load(file)
if pfile["time"]["hour"]==0 and pfile["time"]["min"]==0:
print("Tweet search has already FINISHED")
else:
print("start from ",pfile["time"]["hour"],":",pfile["time"]["min"])
self.mid=pfile["params"]["max_id"]
else:
self.mid=-1
def SearchTweet(self):
"""
main program, request tweets and return them as "result"
Here also check the API limitation
for 1 request
"""
req = self.twitter.get(self.url,params=self.params)
timeline = json.loads(req.text)
self.limit = req.headers['x-rate-limit-remaining'] if 'x-rate-limit-remaining' in req.headers else 0
self.reset = req.headers['X-Rate-Limit-Reset'] if 'x-rate-limit-remaining' in req.headers else 0
if int(self.limit)==0:
if self.mid==-1:
pass
else:
self.Storedata()
self.sum=sum(sum(self.total))
self.Datafiles()
self.WaitUntilReset()
result=timeline["statuses"]
return result
def WaitUntilReset(self):
"""
Wait until cancelation of API limit.
the time for cancellation is in self.reset
"""
sec=int(self.reset)-time.mktime(dt.datetime.now().timetuple())
sec=max(sec,0)
start=dt.datetime.now()+dt.timedelta(seconds=sec)
print("Now pausing... Restart at "+str(start.strftime("%H:%M:%S")))
sys.stdout.flush()
time.sleep(sec+10)
def Datafiles(self):
"""
Make/Reflesh temporal data container
"""
self.total=np.zeros((24,60))
self.nonRT=np.zeros((24,60))
self.RT=np.zeros((24,60))
self.user={}
self.RTList={}
self.langList={}
    def Analyzetweet(self):
        """
        Fetch one page of tweets via SearchTweet() and fold it into the
        running per-day aggregates.

        total, nonRT, RT (2d-array, 24*60): tweet counts for each minute
            of the day (all tweets / original tweets / retweets).
        user, RTList, langList (dict): per-user, per-retweeted-status and
            per-language tallies, merged into self.user / self.RTList /
            self.langList after the page is processed.
        """
        timeline=self.SearchTweet()
        # page-local accumulators; merged into self.* once the page is done
        total=np.zeros((24,60))
        nonRT=np.zeros((24,60))
        RT=np.zeros((24,60))
        user={}
        RTList={}
        langList={}
        for tweet in timeline:
            # 'created_at' is UTC; convert to the zone chosen in Setparams()
            UTC=dt.datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')
            # NOTE: this local name shadows the imported 'time' module
            time=pytz.utc.localize(UTC).astimezone(self.timezone)
            hour=int(dt.datetime.strftime(time,"%H"))
            mint=int(dt.datetime.strftime(time,"%M"))
            total[hour][mint]+=1
            # remember the oldest tweet seen: id drives max_id paging,
            # the time is persisted by Storedata() so a run can resume
            self.mid=tweet['id']
            self.time={"hour":hour,"min":mint}
            lang=tweet["user"]["lang"]
            if lang not in langList.keys():
                langList[lang]={"nonRT":0,"RT":0}
            if "retweeted_status" in tweet.keys():
                # retweet: count it and track the retweeted status
                RT[hour][mint]+=1
                Rtweet=tweet["retweeted_status"]
                langList[lang]["RT"]+=1
                if Rtweet["id"] not in RTList.keys():
                    RTList[Rtweet["id"]]={"count":1}
                else:
                    RTList[Rtweet["id"]]["count"]+=1
                    # keep author/text details only for statuses retweeted
                    # more than 10 times within this page
                    if RTList[Rtweet["id"]]["count"]>10:
                        RTList[Rtweet["id"]]["user_id"]=Rtweet["user"]["id"]
                        RTList[Rtweet["id"]]["screen"]=Rtweet["user"]["screen_name"]
                        RTList[Rtweet["id"]]["name"]=Rtweet["user"]["name"]
                        RTList[Rtweet["id"]]["text"]=Rtweet["full_text"]
                        RTList[Rtweet["id"]]["time"]=Rtweet['created_at']
            else:
                nonRT[hour][mint]+=1
                langList[lang]["nonRT"]+=1
            if tweet["user"]["id"] not in user.keys():
                user[tweet["user"]["id"]]={"count":1,"screen":tweet["user"]["screen_name"],"name":tweet["user"]["name"],"lang":lang}
            else:
                user[tweet["user"]["id"]]["count"]+=1
        # fold the page into the running aggregates
        self.total+=total
        self.nonRT+=nonRT
        self.RT+=RT
        for use,val in user.items():
            if use not in self.user.keys():
                # NOTE(review): screen/name are intentionally dropped here;
                # they are only attached once the accumulated count passes 10
                self.user[use]={}
                self.user[use]["count"]=val["count"]
                self.user[use]["lang"]=val["lang"]
            else:
                self.user[use]["count"]+=val["count"]
                if "name" not in self.user[use].keys() and self.user[use]["count"]>10:
                    self.user[use]["screen"]=val["screen"]
                    self.user[use]["name"]=val["name"]
        for key,val in RTList.items():
            if key not in self.RTList:
                self.RTList[key]=val
            else:
                self.RTList[key]["count"]+=val["count"]
                # copy detail fields the first time they become available
                if "text" in val.keys() and "text" not in self.RTList[key].keys():
                    self.RTList[key]["user_id"]=val["user_id"]
                    self.RTList[key]["name"]=val["name"]
                    self.RTList[key]["screen"]=val["screen"]
                    self.RTList[key]["text"]=val["text"]
                    self.RTList[key]["time"]=val["time"]
        for lang,val in langList.items():
            if lang not in self.langList.keys():
                self.langList[lang]=val
            else:
                self.langList[lang]["nonRT"]+=val["nonRT"]
                self.langList[lang]["RT"]+=val["RT"]
def RepeatSearch(self,repeat=10000):
"""
Repeat Tweet search (Analyze tweet)
repeat (int):how many times repeat request
"""
mid=self.mid
for i in range(repeat):
self.params["max_id"]=mid
maxid=mid
self.Analyzetweet()
mid=self.mid
if mid==maxid:
print("")
print("stop")
break
if i>0:
sys.stdout.flush()
sys.stdout.write("\r{}".format(sum(sum(self.total))+self.sum))
def Storedata(self):
"""
Store analyzed data in local folder
This creates target folde / date folder in the folder pointed by self.path
Automatically stored data and new data is integrated
"""
searchstr=self.params["q"]
target=searchstr[0:searchstr.find("since")-1]
date=searchstr[searchstr.find("since")+6:searchstr.find("since")+16]
path=self.path+"/"+target+"/"+date+"/"
fileList=os.listdir(path)
typeList=["total","nonRT","RT","user","RTList","langList"]
for types in typeList:
if types=="RTList":
filename=target+date+types+".json"
if filename in fileList:
file = open(path+filename, 'r')
pfile=json.load(file)
for key,val in self.RTList.items():
if key not in pfile.keys():
pfile[key]=val
else:
pfile[key]["count"]+=val["count"]
if "name" not in pfile[key].keys() and "name" in val.keys():
pfile[key]["screen"]=val["screen"]
pfile[key]["name"]=val["name"]
pfile[key]["user_id"]=val["user_id"]
pfile[key]["text"]=val["text"]
pfile[key]["time"]=val["time"]
Listtxt=json.dumps(pfile)
folder = open(path+filename, 'w')
json.dump(pfile, folder,indent=4)
else:
Listtxt=json.dumps(self.RTList)
folder = open(path+filename, 'w')
json.dump(self.RTList, folder,indent=4)
elif types=="user":
filename=target+date+types+".json"
if filename in fileList:
file = open(path+filename, 'r')
pfile=json.load(file)
for key,val in self.user.items():
if key not in pfile.keys():
pfile[key]=val
else:
pfile[key]["count"]+=val["count"]
if "name" not in pfile[key].keys() and pfile[key]["count"]>10:
pfile[key]["screen"]=val["screen"]
pfile[key]["name"]=val["name"]
Listtxt=json.dumps(pfile)
folder = open(path+filename, 'w')
json.dump(pfile, folder,indent=4)
else:
Listtxt=json.dumps(self.user)
folder = open(path+filename, 'w')
json.dump(self.user, folder,indent=4)
elif types=="langList":
filename=target+date+types+".json"
if filename in fileList:
file = open(path+filename, 'r')
pfile=json.load(file)
for lang,val in self.langList.items():
if lang not in pfile.keys():
pfile[lang]=val
else:
pfile[lang]["nonRT"]+=val["nonRT"]
pfile[lang]["RT"]+=val["RT"]
Listtxt=json.dumps(pfile)
folder = open(path+filename, 'w')
json.dump(pfile, folder,indent=4)
else:
Listtxt=json.dumps(self.langList)
folder = open(path+filename, 'w')
json.dump(self.langList, folder,indent=4)
else:
if types=="total":
file=self.total
elif types=="nonRT":
file=self.nonRT
elif types=="RT":
file=self.RT
filename=target+date+types+".csv"
if filename in fileList:
pfile=np.loadtxt(path+filename,delimiter=",")
file+=pfile
np.savetxt(path+filename,file,delimiter=",")
DataDict={}
DataDict["params"]=self.params
DataDict["time"]=self.time
DataDict["total"]=sum(sum(self.total))
Listtxt=json.dumps(DataDict)
folder = open(path+"StoredData.json", 'w')
json.dump(DataDict, folder,indent=4)