プロレス統計

「プロレスの数字とプロレスする」をテーマにプロレスに関連する数字を調べ、まとめ、考えるブログです。

MENU

ツイート集計用プログラム(Python用)

from requests_oauthlib import OAuth1Session
import numpy as np
import json
import time
import datetime as dt
import pytz
import sys
import os
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import urllib.request

class SearchTweet(object):
    """Search one day of tweets via the Twitter v1.1 search API and
    aggregate per-minute counts, per-user activity, retweets and languages.

    Typical usage:
        st = SearchTweet()
        st.Setparams("keyword", "2020-01-01", "JST")
        st.CheckStoredData(overwrite=False)
        st.Datafiles()
        st.RepeatSearch()
        st.Storedata()
    """

    # Timezone labels accepted by Setparams mapped to pytz zone names.
    # NOTE: summer time is not handled (same limitation as the original).
    _TIMEZONES = {
        "JST": "Asia/Tokyo",
        "EST": "America/New_York",
        "PST": "America/Los_Angeles",
        "CST": "America/Winnipeg",
    }

    def __init__(self):
        # Fill in your own Twitter API credentials below.
        CONSUMER_KEY = "--Input your CONSUMER_KEY--"
        CONSUMER_SECRET = "--Input your CONSUMER_SECRET--"
        ACCESS_TOKEN = "--Input your ACCESS_TOKEN--"
        ACCESS_TOKEN_SECRET = "--Input your TOKEN_SECRET--"
        self.twitter = OAuth1Session(CONSUMER_KEY, CONSUMER_SECRET,
                                     ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        # tweet_mode=extended so 'full_text' (untruncated) is returned.
        self.url = "https://api.twitter.com/1.1/search/tweets.json?tweet_mode=extended"
        self.sum = 0  # tweet total carried over API-rate-limit pauses
        self.time = {"hour": None, "min": None}  # time of last processed tweet
        # Root folder for stored data.  Defaults to "csvfiles" so Storedata
        # writes to the same tree that CheckStoredData creates (the old
        # default "" produced the absolute path "/target/date/").
        self.path = "csvfiles"

    def Setparams(self, target, date, timezone):
        """Set the search parameters.

        target (str): query string for the Twitter search.
        date (str): day to search, "yyyy-mm-dd" (single-day window).
        timezone (str): "JST", "EST", "PST" or "CST" (no summer time).

        By default 100 tweets are requested per call; change the "count"
        entry of self.params to alter that.

        Raises:
            ValueError: for an unsupported timezone label (previously
                self.timezone was silently left unset, causing a later
                AttributeError in Analyzetweet).
        """
        searchstr = target
        searchstr += " since:" + date + "_00:00:00_" + timezone
        searchstr += " until:" + date + "_23:59:59_" + timezone

        self.params = {'q': searchstr, 'count': 100}
        if timezone not in self._TIMEZONES:
            raise ValueError("Unsupported timezone: " + timezone)
        self.timezone = pytz.timezone(self._TIMEZONES[timezone])

    def CheckStoredData(self, overwrite=True):
        """Prepare the per-target/per-date folder and the resume state.

        overwrite: if True, delete previously stored data for the same
            target/date and restart from scratch.  If False and a
            StoredData.json exists, it is inspected: a stored time of
            0:00 means the search already finished, otherwise the search
            resumes from the stored max_id.
        """
        searchstr = self.params["q"]
        target = searchstr[0:searchstr.find("since") - 1]
        date = searchstr[searchstr.find("since") + 6:searchstr.find("since") + 16]
        print(target, date)

        newtag = "csvfiles/" + target
        if not os.path.exists(newtag):  # make tag folder
            # makedirs also creates the "csvfiles" root if it is missing
            # (os.mkdir would raise FileNotFoundError in that case).
            os.makedirs(newtag)
            print("Create", newtag)
        newdate = "csvfiles/" + target + "/" + date
        if not os.path.exists(newdate):  # make date folder
            os.mkdir(newdate)
            print("Create", newdate)

        path = "csvfiles/" + target + "/" + date + "/"
        fileList = os.listdir(path)
        if overwrite:
            self.mid = -1
            print("clear all data in", path)
            for file in fileList:
                os.remove(path + file)
        elif "StoredData.json" in fileList:
            with open(path + "StoredData.json", 'r') as fh:
                pfile = json.load(fh)
            if pfile["time"]["hour"] == 0 and pfile["time"]["min"] == 0:
                print("Tweet search has already FINISHED")
            else:
                print("start from ", pfile["time"]["hour"], ":", pfile["time"]["min"])
            self.mid = pfile["params"]["max_id"]
        else:
            self.mid = -1

    def SearchTweet(self):
        """Perform one search request and return the list of tweets.

        Also reads the rate-limit headers; when the remaining quota hits
        zero the collected data is flushed to disk and the program sleeps
        until the limit resets.
        """
        req = self.twitter.get(self.url, params=self.params)
        timeline = json.loads(req.text)
        # requests' header lookup is case-insensitive.  The original read
        # the reset time guarded by the *remaining* header's key, so the
        # reset timestamp could be reported as 0 even when present.
        self.limit = req.headers.get('x-rate-limit-remaining', 0)
        self.reset = req.headers.get('x-rate-limit-reset', 0)

        if int(self.limit) == 0:
            if self.mid != -1:
                self.Storedata()
                self.sum = sum(sum(self.total))
            self.Datafiles()
            self.WaitUntilReset()

        # An API error response has no "statuses" key; return an empty
        # list so RepeatSearch stops cleanly instead of raising KeyError.
        return timeline.get("statuses", [])

    def WaitUntilReset(self):
        """Sleep until the API rate limit resets (epoch time in self.reset)."""
        sec = int(self.reset) - time.mktime(dt.datetime.now().timetuple())
        sec = max(sec, 0)
        start = dt.datetime.now() + dt.timedelta(seconds=sec)
        print("Now pausing... Restart at " + str(start.strftime("%H:%M:%S")))
        sys.stdout.flush()
        time.sleep(sec + 10)  # +10 s safety margin past the reset time

    def Datafiles(self):
        """Create/refresh the in-memory data containers."""
        self.total = np.zeros((24, 60))  # all tweets per minute
        self.nonRT = np.zeros((24, 60))  # original (non-RT) tweets per minute
        self.RT = np.zeros((24, 60))     # retweets per minute
        self.user = {}      # user id -> {"count", "lang", ["screen", "name"]}
        self.RTList = {}    # retweeted status id -> {"count", ...details}
        self.langList = {}  # language code -> {"nonRT": n, "RT": n}

    def Analyzetweet(self):
        """Fetch one page of tweets and fold it into the running tallies.

        total/nonRT/RT are 24x60 arrays counting tweets per minute of the
        day (in self.timezone); user, RTList and langList are dictionaries
        keyed by user id, retweeted status id and language code.
        """
        timeline = self.SearchTweet()

        total = np.zeros((24, 60))
        nonRT = np.zeros((24, 60))
        RT = np.zeros((24, 60))
        user = {}
        RTList = {}
        langList = {}
        for tweet in timeline:
            utc = dt.datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
            # renamed from 'time' to avoid shadowing the time module
            local = pytz.utc.localize(utc).astimezone(self.timezone)
            hour = local.hour
            mint = local.minute

            total[hour][mint] += 1
            self.mid = tweet['id']
            self.time = {"hour": hour, "min": mint}
            lang = tweet["user"]["lang"]
            langList.setdefault(lang, {"nonRT": 0, "RT": 0})

            if "retweeted_status" in tweet:
                RT[hour][mint] += 1
                rtweet = tweet["retweeted_status"]
                langList[lang]["RT"] += 1
                rid = rtweet["id"]
                if rid not in RTList:
                    RTList[rid] = {"count": 1}
                else:
                    RTList[rid]["count"] += 1
                    # keep full details only for heavily retweeted statuses
                    if RTList[rid]["count"] > 10:
                        RTList[rid]["user_id"] = rtweet["user"]["id"]
                        RTList[rid]["screen"] = rtweet["user"]["screen_name"]
                        RTList[rid]["name"] = rtweet["user"]["name"]
                        RTList[rid]["text"] = rtweet["full_text"]
                        RTList[rid]["time"] = rtweet['created_at']
            else:
                nonRT[hour][mint] += 1
                langList[lang]["nonRT"] += 1
                uid = tweet["user"]["id"]
                if uid not in user:
                    user[uid] = {"count": 1,
                                 "screen": tweet["user"]["screen_name"],
                                 "name": tweet["user"]["name"],
                                 "lang": lang}
                else:
                    user[uid]["count"] += 1

        self.total += total
        self.nonRT += nonRT
        self.RT += RT
        for uid, val in user.items():
            if uid not in self.user:
                # screen/name are deliberately dropped until the user
                # passes the >10 tweet threshold (keeps the file small)
                self.user[uid] = {"count": val["count"], "lang": val["lang"]}
            else:
                self.user[uid]["count"] += val["count"]
                if "name" not in self.user[uid] and self.user[uid]["count"] > 10:
                    self.user[uid]["screen"] = val["screen"]
                    self.user[uid]["name"] = val["name"]
        for rid, val in RTList.items():
            if rid not in self.RTList:
                self.RTList[rid] = val
            else:
                self.RTList[rid]["count"] += val["count"]
                if "text" in val and "text" not in self.RTList[rid]:
                    for field in ("user_id", "name", "screen", "text", "time"):
                        self.RTList[rid][field] = val[field]
        for lang, val in langList.items():
            if lang not in self.langList:
                self.langList[lang] = val
            else:
                self.langList[lang]["nonRT"] += val["nonRT"]
                self.langList[lang]["RT"] += val["RT"]

    def RepeatSearch(self, repeat=10000):
        """Repeat Analyzetweet up to `repeat` times.

        repeat (int): maximum number of requests.  Stops early when
        max_id no longer advances (no older tweets remain).
        """
        mid = self.mid
        for i in range(repeat):
            self.params["max_id"] = mid
            maxid = mid
            self.Analyzetweet()
            mid = self.mid
            if mid == maxid:
                print("")
                print("stop")
                break
            if i > 0:
                sys.stdout.flush()
            sys.stdout.write("\r{}".format(sum(sum(self.total)) + self.sum))

    def Storedata(self):
        """Persist the in-memory tallies under self.path/target/date/.

        Existing files for the same target/date are merged with the new
        data (counts are summed), so repeated flushes accumulate.  The
        folder must already exist (CheckStoredData creates it).
        """
        searchstr = self.params["q"]
        target = searchstr[0:searchstr.find("since") - 1]
        date = searchstr[searchstr.find("since") + 6:searchstr.find("since") + 16]
        path = self.path + "/" + target + "/" + date + "/"
        fileList = os.listdir(path)
        prefix = target + date

        self._store_json(path, fileList, prefix + "RTList.json",
                         self.RTList, self._merge_rtlist)
        self._store_json(path, fileList, prefix + "user.json",
                         self.user, self._merge_user)
        self._store_json(path, fileList, prefix + "langList.json",
                         self.langList, self._merge_langlist)

        for types, arr in (("total", self.total),
                           ("nonRT", self.nonRT),
                           ("RT", self.RT)):
            filename = prefix + types + ".csv"
            if filename in fileList:
                # in-place add, so self.total/nonRT/RT now hold the grand
                # totals (SearchTweet relies on this after a flush)
                arr += np.loadtxt(path + filename, delimiter=",")
            np.savetxt(path + filename, arr, delimiter=",")

        DataDict = {
            "params": self.params,
            "time": self.time,
            "total": float(self.total.sum()),
        }
        with open(path + "StoredData.json", 'w') as fh:
            json.dump(DataDict, fh, indent=4)

    def _store_json(self, path, fileList, filename, new_data, merge):
        """Write new_data to path/filename, merging with an existing file.

        JSON object keys are always strings, so integer ids from the live
        dictionaries are normalised with str() before comparison — the
        original compared int keys against loaded string keys and thus
        duplicated every entry on merge.
        """
        if filename in fileList:
            with open(path + filename, 'r') as fh:
                stored = json.load(fh)
            merge(stored, new_data)
        else:
            stored = {str(k): v for k, v in new_data.items()}
        with open(path + filename, 'w') as fh:
            json.dump(stored, fh, indent=4)

    @staticmethod
    def _merge_rtlist(stored, new):
        """Merge retweet tallies; copy details once a status has them."""
        for key, val in new.items():
            k = str(key)
            if k not in stored:
                stored[k] = val
            else:
                stored[k]["count"] += val["count"]
                if "name" not in stored[k] and "name" in val:
                    for field in ("screen", "name", "user_id", "text", "time"):
                        stored[k][field] = val[field]

    @staticmethod
    def _merge_user(stored, new):
        """Merge per-user counts; attach names past the >10 threshold.

        Guard on "name" in val: a fresh entry may lack screen/name (they
        are only attached once the user crosses the threshold), which made
        the original raise KeyError here.
        """
        for key, val in new.items():
            k = str(key)
            if k not in stored:
                stored[k] = val
            else:
                stored[k]["count"] += val["count"]
                if ("name" not in stored[k] and "name" in val
                        and stored[k]["count"] > 10):
                    stored[k]["screen"] = val["screen"]
                    stored[k]["name"] = val["name"]

    @staticmethod
    def _merge_langlist(stored, new):
        """Merge per-language RT / non-RT counters."""
        for lang, val in new.items():
            if lang not in stored:
                stored[lang] = val
            else:
                stored[lang]["nonRT"] += val["nonRT"]
                stored[lang]["RT"] += val["RT"]