The code is supposed to pull results from the API, split them by day, and store each day’s results as a distinct JSON file. Why doesn't it create the JSON files?
twint_loop splits the date range into a series of days and calls twint_search to run the search for each day. Each JSON file is named after its date and stored in a directory named after the search term, with clean_name used to ensure that the directory name is valid.
from datetime import timedelta
from string import ascii_letters, digits
from os import mkdir, path
import pandas as pd
import twint
def clean_name(dirname):
    """Return *dirname* with every character that is not an ASCII letter or digit removed."""
    allowed = frozenset(ascii_letters + digits)
    # Keep the surviving characters in their original order.
    return ''.join(filter(allowed.__contains__, dirname))
def twint_search(searchterm, since, until, json_name):
    """Run a Twint search for one date range and store the results as JSON.

    Args:
        searchterm: Query assigned to the Twint config's ``Search`` option.
        since: Start of the range, assigned to ``Since``
            (the caller in this file passes a 'YYYY-MM-DD' string).
        until: End of the range, assigned to ``Until``
            (the caller in this file passes a 'YYYY-MM-DD' string).
        json_name: Output path assigned to ``Output``.

    A failing search is reported and then swallowed, so a caller that loops
    over many date ranges can carry on with the next one.
    """
    c = twint.Config()
    c.Search = searchterm
    c.Since = since
    c.Until = until
    c.Hide_output = True
    c.Store_json = True
    c.Output = json_name
    c.Debug = True
    try:
        twint.run.Search(c)
    except Exception as exc:
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate on their own. Report the actual cause:
        # without it there is no way to tell why a day produced no file.
        print("Problem with %s: %r" % (since, exc))
def twint_loop(searchterm, since, until):
    """Run one Twint search per day from *since* through *until*.

    Results for each day go to ``<clean_name(searchterm)>/<YYYY-MM-DD>.json``.
    The directory is created if it does not exist yet.
    """
    target_dir = clean_name(searchterm)
    try:
        mkdir(target_dir)
    except FileExistsError:
        print("Directory", target_dir, "already exists")
    else:
        print("Directory", target_dir, "Created ")

    one_day = timedelta(days=1)
    for day in pd.date_range(since, until):
        # Each search covers a single day: from this date to the next one.
        day_start = day.strftime("%Y-%m-%d")
        day_end = (day + one_day).strftime("%Y-%m-%d")
        out_file = path.join(target_dir, '%s.json' % day_start)
        print('Getting %s ' % day_start)
        twint_search(searchterm, day_start, day_end, out_file)
# Search term and date range for the run.
# NOTE(review): '06-01-2021' is ambiguous; pandas' default parsing reads it
# month-first (June 1st, 2021). Confirm that is the intended range.
search_term = '#Microbiology'
twint_loop(search_term, '06-01-2021', '07-01-2021')

from glob import glob

# Read from the directory twint_loop wrote to. It is derived from the search
# term via clean_name, so it must not be a hard-coded, unrelated folder name.
results_dir = clean_name(search_term)
# File names are ISO dates, so sorting them gives chronological order.
file_names = sorted(glob(path.join(results_dir, '*.json')))
dfs = [pd.read_json(fn, lines=True) for fn in file_names]
if dfs:
    mic_df = pd.concat(dfs)
    mic_df.info()
else:
    # pd.concat raises ValueError on an empty list; report the problem instead.
    mic_df = pd.DataFrame()
    print("No json files found in", results_dir)