Scraping tweets on a specific topic from Twitter with Python and saving them to CSV
2019-04-16
早睡早起吃嘛嘛香
Step 0. Go to the Twitter developer site and apply for a developer account: describe the project you plan to build and what you want to do with Twitter's data. There are step-by-step tutorials online; approval usually takes about three or four days.
Step 1.
Fetch tweets on the 2020Election topic from the past seven days (the standard search API only covers roughly the last week, hence the function name).
Define two functions:
get_7days_tweets() fetches the data;
convert_tweets_to_csv() writes the fetched tweets to a CSV file.
Each search request returns at most 100 tweets, so we need a loop that passes max_id to fetch tweets older than the ones already collected (see the paging note after get_7days_tweets below).
import json
import requests
import requests_oauthlib
import pandas as pd

# Replace the values below with yours
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
ACCESS_TOKEN = ''
ACCESS_SECRET = ''
my_auth = requests_oauthlib.OAuth1(CONSUMER_KEY, CONSUMER_SECRET,
                                   ACCESS_TOKEN, ACCESS_SECRET)
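# Not in the original post: an optional sanity check that the four keys above
# are valid, using Twitter's account/verify_credentials endpoint.
# A 200 status means OAuth1 signing works; 401 means bad credentials.
check = requests.get('https://api.twitter.com/1.1/account/verify_credentials.json',
                     auth=my_auth)
print('credential check:', check.status_code)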
def get_7days_tweets(max_id=0):
    # 30-day premium endpoint, if you have a premium environment:
    # url = 'https://api.twitter.com/1.1/tweets/search/30day/tweets.json'
    url = 'https://api.twitter.com/1.1/search/tweets.json'
    query_data = [('q', '2020Election'), ('count', '100')]
    if max_id != 0:
        query_data.append(('max_id', str(max_id)))
    query_url = url + '?' + '&'.join(str(k) + '=' + str(v) for k, v in query_data)
    response = requests.get(query_url, auth=my_auth, stream=True)
    print(query_url, response)
    return response
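# A one-off usage sketch (not in the original post): the standard search API
# pages backwards in time, so the first call omits max_id and each following
# call passes an id below the oldest tweet already fetched. For example:
#   first_page = get_7days_tweets().json()['statuses']
#   oldest_id = first_page[-1]['id']
#   next_page = get_7days_tweets(oldest_id - 1).json()['statuses']
# Subtracting 1 matters because max_id is inclusive; passing oldest_id
# unchanged would re-fetch the last tweet of the previous page.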
def convert_tweets_to_csv(http_resp):
    last_id = None
    with open('tweets_test_7days.csv', 'a+', encoding='utf-8') as file:
        for resp in http_resp.iter_lines():
            if not resp:  # skip empty keep-alive lines from iter_lines()
                continue
            statuses = json.loads(resp)['statuses']
            if not statuses:  # no results left on this page
                break
            last_id = statuses[-1]['id']
            for status in statuses:
                line = []
                # Columns saved to the CSV:
                # id, user_id, created_at, user_location, text
                line.append(str(status['id']))
                line.append(str(status['user']['id']))
                line.append(str(status['created_at']))
                # line.append(status['user']['screen_name'])
                # line.append(status['source'])
                if status['user']['location'] is not None:
                    line.append(status['user']['location'])
                else:
                    line.append('None')
                # Don't .encode() the text: in Python 3 pandas would write
                # the bytes repr (b'...') instead of the string itself.
                line.append(status['text'].replace('\n', ' '))
                df = pd.DataFrame(line).T
                df.to_csv(file, header=False, index=False)
    return last_id
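# A hedged alternative (not in the original post): writing rows with the
# standard-library csv module avoids building a one-row DataFrame per tweet
# and handles quoting of commas and quotes inside the tweet text for us.
# write_rows_with_csv_module is a hypothetical helper, not part of the post.
import csv

def write_rows_with_csv_module(rows, path='tweets_test_7days.csv'):
    # `rows` is assumed to be a list of [id, user_id, created_at, location, text]
    with open(path, 'a+', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(rows)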
print("Starting getting tweets.")
resp = get_7days_tweets()
while True:
last_id = convert_tweets_to_csv(resp)
resp = get_7days_tweets(last_id)
# convert_tweets_to_csv(resp)
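One caveat the loop above ignores: the standard search API is rate limited (180 requests per 15-minute window with user auth), so a long back-fill will eventually get HTTP 429 responses. A minimal sketch of a backoff wrapper, assuming the response carries Twitter's x-rate-limit-reset header (epoch seconds); get_with_backoff is a hypothetical name, not from the original post:

import time

def get_with_backoff(max_id=0):
    # On HTTP 429, sleep until the rate-limit window resets, then retry once.
    resp = get_7days_tweets(max_id)
    if resp.status_code == 429:
        reset_at = int(resp.headers.get('x-rate-limit-reset', time.time() + 900))
        time.sleep(max(reset_at - time.time(), 0) + 1)
        resp = get_7days_tweets(max_id)
    return resp

Swapping get_with_backoff in for get_7days_tweets in the driver loop lets the back-fill run unattended across rate-limit windows.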