I'm trying to scrape several pages of a site with Selenium and Python, but my code is breaking over and over. I want to be able to enter the page number in the value box given at the bottom of every page. As of now my code does enter the page number but it breaks right after the new page is loaded. I've been able to scrape just the first page, and as soon as the second page loads, the code breaks.
Here's my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
driver = webdriver.Safari()
wait = WebDriverWait(driver, 1)
driver.get("http://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")
call_names = {"Address": "Address", "State": "State", "City": "City", "Chief Commissioner of Income Tax Cadre Controlling Authority (CCIT- CCA) / DGIT (Exemptions)":"CCIT_DGIT_Exemptions", "Chief Commissioner of Income Tax (CCIT)":"CCIT", "Commissioner of Income Tax (CIT)": "CIT","Approved under Section": "Approved_under_Section", "Date of Order (DD/MM/YYYY)": "Date_of_order", "Date of Withdrawal/Cancellation (DD/MM/YYYY)":"Date_of_withdrawal", "Date of Expiry (DD/MM/YYYY)": "Date_of_Expiry", "Remarks": "Remarks"}
while True:
for elem in wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME,"faq-sub-content exempted-result"))):
listofIDstoScrape = []
name = elem.find_elements_by_class_name("fc-blue fquph")
pancard = elem.find_elements_by_class_name("pan-id")
details = driver.find_elements_by_class_name("exempted-detail")
for i in details:
pan = i.text
wait.until(EC.presence_of_element_located((By.TAG_NAME, 'li')))
for n, p, key in zip(name, pancard, details):
main_list = {"Name": (n.text.replace(p.text,'')), "Pancard": p.text}
for elem_li in key.find_elements_by_tag_name("li"):
main_list[call_names [elem_li.find_element_by_tag_name('strong').text]] = elem_li.find_element_by_tag_name('span').text
print (main_list)
try:
for k in range(2,10):
myElem = WebDriverWait(driver, 1).until(EC.presence_of_element_located((By.ID, "ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_txtPageNumber")))
myElem.send_keys(str(k))
myElem.send_keys(Keys.RETURN)
print ("Page is ready!")
break
except TimeoutException:
print ("Loading took too much time!")
And here's the error:
--------------------------------------------------------------------------
---------------------------------------------------------------------------
---------------------------------------------------------------------------
StaleElementReferenceException Traceback (most recent call last)
<ipython-input-66-aa6debbcbeae> in <module>()
32
33 for elem_li in key.find_elements_by_tag_name("li"):
---> 34 main_list[call_names [elem_li.find_element_by_tag_name('strong').text]] = elem_li.find_element_by_tag_name('span').text
35
36 print (main_list)
/anaconda/lib/python3.6/site-packages/selenium/webdriver/remote/webelement.py in find_element_by_tag_name(self, name)
230 - name - name of html tag (eg: h1, a, span)
231 """
--> 232 return self.find_element(by=By.TAG_NAME, value=name)
233
234 def find_elements_by_tag_name(self, name):
/anaconda/lib/python3.6/site-packages/selenium/webdriver/remote/webelement.py in find_element(self, by, value)
516
517 return self._execute(Command.FIND_CHILD_ELEMENT,
--> 518 {"using": by, "value": value})['value']
519
520 def find_elements(self, by=By.ID, value=None):
/anaconda/lib/python3.6/site-packages/selenium/webdriver/remote/webelement.py in _execute(self, command, params)
499 params = {}
500 params['id'] = self._id
--> 501 return self._parent.execute(command, params)
502
503 def find_element(self, by=By.ID, value=None):
/anaconda/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py in execute(self, driver_command, params)
309 response = self.command_executor.execute(driver_command, params)
310 if response:
--> 311 self.error_handler.check_response(response)
312 response['value'] = self._unwrap_value(
313 response.get('value', None))
/anaconda/lib/python3.6/site-packages/selenium/webdriver/remote/errorhandler.py in check_response(self, response)
235 elif exception_class == UnexpectedAlertPresentException and 'alert' in value:
236 raise exception_class(message, screen, stacktrace, value['alert'].get('text'))
--> 237 raise exception_class(message, screen, stacktrace)
238
239 def _value_or_default(self, obj, key, default):
StaleElementReferenceException: Message: An element command failed because the referenced element is no longer available.
That's how the output looks like:
{'Name': 'INDIA INCLUSION FOUNDATION', 'Pancard': 'AABTI3598J', 'Address': 'No.250/1, 16th and 17th Cross, \nSampige Road, Malleshwaram,\nBangalore-560003.', 'State': 'KARNATAKA', 'City': 'BANGALORE', 'CCIT_DGIT_Exemptions': 'PR.CCIT BENGALURU', 'CCIT': 'CCIT(E) NEW DELHI', 'CIT': 'CIT(E) BENGALURU', 'Approved_under_Section': '12A', 'Date_of_order': '30/03/3017', 'Date_of_withdrawal': ' - ', 'Date_of_Expiry': ' - ', 'Remarks': ' - '}