我正在编写一个 Python 脚本,用于从国家数字预报数据库 (NDFD) 的 FTP 服务器下载数据。服务器上的文件按 Year/YearMonth/YearMonthDay 的目录结构组织:我必须先进入某一天的文件夹下载一个文件,然后返回上级目录,再进入下一天的文件夹继续下载,依此类推。我目前的代码非常慢,下载一天的数据大约需要 20 秒,也就是说下载一年的数据大约需要 2 小时。我希望能让它运行得更快。我的代码如下。
from ftplib import FTP
import ftplib
import os
import datetime as dt
import pandas as pd
import time
def ndfd_download(keyword, days_, forecast_hour):
    """Download NDFD forecast files near a given hour for each requested day.

    For every day in ``days_`` the remote directory
    ``/NDFD/<YYYYMM>/<YYYYMMDD>/`` is listed. Files whose name starts with
    ``keyword`` and whose last four characters, read as an HHMM time, fall
    within 30 minutes on either side of ``forecast_hour`` are saved under
    ``<YYYY>/<YYYYMM>/<YYYYMMDD>/`` relative to the current working
    directory. The process working directory is never changed.

    All transfers share a single FTP connection and run sequentially.

    Args:
        keyword: File-name prefix used to select files (e.g. ``"YAUZ98"``).
        days_: Iterable of objects exposing ``year``, ``month`` and ``day``
            attributes (e.g. ``datetime.datetime`` instances).
        forecast_hour: Hour of the forecast, as a number or numeric string.

    Returns:
        None. Progress, errors and the elapsed seconds per day are printed.
        If the connection or login fails, the error is printed and the
        function returns without downloading anything.
    """
    # Search for the files between 30 minutes on either side of the forecast
    # hour; the extra -40 converts 100 to 60 minutes (e.g. 1400 - 70 = 1330).
    time_start = int(float(forecast_hour) * 100 - 30 - 40)
    time_end = int(float(forecast_hour) * 100 + 30)

    print('Starting connection to NOAA database')
    ftp = None
    try:
        ftp = FTP('nomads.ncdc.noaa.gov')
        ftp.login()
        print('Connect successful')
    except ftplib.all_errors as e:
        print(_ftp_error_code(e))
        if ftp is not None:
            ftp.close()
        # Without a logged-in session nothing below can work.
        return

    try:
        ftp.cwd('/NDFD/')
        print('Current working directory is %s' % ftp.pwd())
        for day_ in days_:
            start = time.time()
            _download_day(ftp, keyword, day_, time_start, time_end)
            print(time.time() - start)
    finally:
        # Release the connection even if an unexpected error propagates.
        ftp.close()


def _ftp_error_code(error):
    """Return the first whitespace-delimited token of an error message."""
    tokens = str(error).split(None, 1)
    return tokens[0] if tokens else repr(error)


def _in_time_window(file_name, time_start, time_end):
    """Tell whether the last four characters of a name lie in the window."""
    try:
        file_time = float(file_name[-4:])
    except ValueError:
        # A name without a numeric suffix carries no forecast time; skip it.
        return False
    return time_start <= file_time <= time_end


def _download_day(ftp, keyword, day_, time_start, time_end):
    """Fetch the matching files of one day into its local directory."""
    year = "{:02d}".format(day_.year)
    year_month = year + "{:02d}".format(day_.month)
    year_month_day = year_month + "{:02d}".format(day_.day)

    try:
        # Change to the desired NDFD directory and list its files.
        ftp.cwd('/NDFD/{}/{}/'.format(year_month, year_month_day))
        remote_names = ftp.nlst()
    except ftplib.error_perm as e:
        print('Error', e)
        return

    wanted = [
        name for name in remote_names
        if name.startswith(keyword)
        and _in_time_window(name, time_start, time_end)
    ]

    # Files are written by path, so the working directory stays untouched.
    local_dir = os.path.join(year, year_month, year_month_day)
    os.makedirs(local_dir, exist_ok=True)

    print('Downloading data for {}'.format(year_month_day))
    for name in wanted:
        local_path = os.path.join(local_dir, name)
        failed = False
        with open(local_path, 'wb') as local_file:
            try:
                # Save the file under the same name as on the server.
                ftp.retrbinary('RETR %s' % name, local_file.write)
            except ftplib.all_errors as e:
                print('Error', _ftp_error_code(e))
                failed = True
        if failed:
            # Do not leave an empty or truncated file that looks like data.
            try:
                os.remove(local_path)
            except OSError:
                pass
if __name__ == "__main__":
    # File-name prefix of the NDFD product to fetch.
    keyword = "YAUZ98"
    years = [2018]
    for year in years:
        # First day to download and the number of consecutive days after it.
        month, day = 1, 30
        # NOTE: use 366 (leap year) or 365 here to cover a whole year.
        no_of_days = 100
        first_day = dt.datetime(year, month, day)
        days_ = [
            first_day + dt.timedelta(days=offset)
            for offset in range(no_of_days)
        ]
        forecast_hour = '14'
        ndfd_download(keyword, days_, forecast_hour)