0

我完成了抓取该网站的代码:https://publicrecordsaccess.fultoncountyga.gov/Portal/Home/Dashboard/29 。要访问搜索结果页面,您需要在搜索框中输入“White, Jasmine”并手动通过 reCAPTCHA 验证。这是我的代码:

# Page to open and the party name typed into its search box.
web = 'https://publicrecordsaccess.fultoncountyga.gov/Portal/Home/Dashboard/29'
keys = 'White, Jasmine'

# nb is the 1-based row index used when addressing rows of the results grid;
# b is the id prefix that gets joined with a party id to address a case grid.
nb = 1
b = "CaseResultsGrid_"

# One accumulator list per scraped field; all start out empty.
name, address, dob, race, gender = [], [], [], [], []
defendant, date, ftype, status, case_id = [], [], [], [], []

# Start Chrome through the local chromedriver binary and load the page.
op = webdriver.ChromeOptions()
driver = webdriver.Chrome(
    executable_path="/usr/local/bin/chromedriver",
    options=op,
)
actions = ActionChains(driver)
driver.get(web)

# Type the search keys into the search-criteria field.
driver.find_element_by_xpath(
    '//*[@id="caseCriteria_SearchCriteria"]'
).send_keys(keys)

# Pause here so the operator can solve the reCAPTCHA by hand and let the
# results page finish loading; scraping resumes once Enter is pressed.
input("Press Enter to continue...")

def _text_or_blank(card, xpath):
    """Return the stripped text of the element found at *xpath*, or ''.

    The lookup is issued through *card* exactly as the inline lookups were.
    Any ordinary failure (element missing, stale reference, ...) yields an
    empty string so that every field list still receives one entry per
    party. KeyboardInterrupt and SystemExit are deliberately not swallowed.
    """
    try:
        return card.find_element_by_xpath(xpath).text.strip()
    except Exception:
        return ''


content = driver.find_elements_by_css_selector(".k-detail-cell .party-card")

try:
    for c in content:
        # XPath of the results-grid row that belongs to this party card.
        row = f'//*[@id="Grid"]/table/tbody/tr[{nb}]'

        # Stop as soon as the grid has no party link in row nb.
        try:
            pat = c.find_element_by_xpath(f'{row}/td[2]/a')
        except Exception:
            break

        # Move to the card, give the page a moment, then read the card's id.
        actions.move_to_element(c).perform()
        time.sleep(0.2)
        g = c.get_attribute('id')

        # The party id on the link, prefixed with b, is the id of the grid
        # that holds this party's case rows.
        id_nub = pat.get_attribute('data-party-id')
        k = b + id_nub

        # NOTE(review): only the first row (tr[1]) of the case grid is read,
        # so a party with several cases contributes just its first case.
        first_case = f'//*[@id="{k}"]/table/tbody/tr[1]'

        # Party details: name and date of birth from the grid row, the rest
        # from the party card.
        name.append(_text_or_blank(c, f'{row}/td[2]/a'))
        dob.append(_text_or_blank(c, f'{row}/td[3]'))
        address.append(
            _text_or_blank(c, f'//*[@id="{g}"]/div[1]/div[1]/div/div[2]/div'))
        race.append(
            _text_or_blank(c, f'//*[@id="{g}"]/div[1]/div[2]/div/div[2]'))
        gender.append(
            _text_or_blank(c, f'//*[@id="{g}"]/div[1]/div[3]/div/div[2]/div'))

        # Case details from the first case row.
        case_id.append(_text_or_blank(c, f'{first_case}/td[2]/a'))
        defendant.append(_text_or_blank(c, f'{first_case}/td[3]/div'))
        # Fixed: the fallback used to call fdate.append(''), which raised
        # instead of recording a blank file date in the date list.
        date.append(_text_or_blank(c, f'{first_case}/td[4]'))
        ftype.append(_text_or_blank(c, f'{first_case}/td[5]'))
        status.append(_text_or_blank(c, f'{first_case}/td[6]'))

        # Advance by 2: only the odd-numbered grid rows carry party data.
        nb += 2
finally:
    # Always end the browser session, even if scraping failed part-way;
    # quit() also shuts down the chromedriver process.
    driver.quit()

print('Done')

# Assemble the per-field lists into one table, one column per list, in the
# order given below.
# NOTE(review): the 'Tyep' label is kept exactly as spelled so that anything
# already reading this column by name keeps working.
df = pd.DataFrame(dict(zip(
    ['Name', 'Address', 'DOB', 'Race', 'Gender',
     'Case_ID', 'Defendant', 'File_Date', 'Tyep', 'Status'],
    [name, address, dob, race, gender,
     case_id, defendant, date, ftype, status],
)))

我的代码运行良好,不过也许还可以更快、更简洁。它可以抓取所有内容,但有一个例外:对每个人,我的代码只能抓取第一条案件记录。如下图所示,第一个 Jasmine White 有两条记录,而第二个 Jasmine White 只有一条。我想添加/修改我的代码,以便无论每个人有多少条记录(1、2、3 条或更多),都能全部抓取。

如果我能对此提出一些建议,那就太好了!谢谢你们!!

在此处输入图像描述

4

0 回答 0