我正在为这个网站上的所有 App.No 抓取:WIPO。我正在通过这个网站抓取内容,我似乎无法循环浏览我想要的最后一页(100),即使我的代码确实点击到了下一页。此外,我拉出的内容只是第一页的重复内容。在弹出错误消息之前,我经历的最远循环是 12 页。并且出于某种原因,即使使用相同的代码,每次它都停在不同的页面上?
StaleElementReferenceException:消息:过时的元素引用:元素未附加到页面文档
我理解这意味着找不到我的路径,但是通过更改url无法访问网站,但是我在不同的页面上查看了路径,它们没有改变,所以我不知道如何移动向前。我的代码看起来像这样。如果有人可以帮忙?
class autoScraper():
def __init__(self,ep="./chromedriver",headless=False):
options = webdriver.ChromeOptions()
if headless: options.add_argument("--headless");
options.add_argument("--start-maximized")
self.driver= webdriver.Chrome(executable_path=ep,options=options);
def closeDriver(self):
self.driver.close()
def next_page(self):
# btn=self.driver.find_elements_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[2]/div/div[2]/div/a/span')
btn=self.driver.find_elements_by_css_selector('a[title="Next Page"]')
if len(btn)>0:
btn[0].click()
def connector(self,a="https://patentscope.wipo.int/search/en/search.jsf"):
success = False;
try:
self.driver.get(a)
self.driver.find_element_by_xpath('/html/body/div[2]/div[5]/div/div[2]/form/div/div[1]/div[2]/div/div/div[1]/div[2]/button').click()
self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[2]/div/select[1]').click() # cilck to select the num of showing IPs on a page.
self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[2]/div/select[1]/option[4]').click()
self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[1]/div/select[1]/option[2]').click()
self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[2]/div/select[1]/option[4]').click()
self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[1]/div/div[1]/div[3]/div/select[1]/option[2]').click()
success = True
except Exception as e:
print(e)
if success:
return success
def getPCT(self):
PCT = []
for i in range(1,201):
no = self.driver.find_element_by_xpath('/html/body/div[2]/div[4]/div/div[1]/div[2]/div/form[2]/div/div[1]/div/div/table/tbody/tr[%d]/td/div/div[2]/div/div[1]/span[2]/span[2]' %(i)).text
PCT.append(no)
return PCT
def clickNextPage(self):
self.driver.find_element_by_css_selector('a[title="Next Page"]').click()
if __name__ == '__main__':
PCT=[]
driver = autoScraper()
if driver.connector():
sleep(10)
while i<100:
i=i+1
PCT=driver.getPCT()
driver.clickNextPage()
driver.next_page()
print('The num of scraped PCTs:',len(PCT))
try:
os.system('mkdir ./download/')
except:
print('The directory is already existed.')
finally:
with open('./download/pct.txt','a') as f:
for line in PCT:
f.write(line+'\n')
print('urls writen to ./download/pct.txt')
driver.closeDriver()