I want to extract all the URLs and their link text (names) from an HTML page using lxml.
I can parse the page at a URL and find them that way, but is there an easier way to get all the URL links using lxml?
from lxml.html import parse
# Parse the document at the given URL and take the root element of the tree.
tree = parse('http://www.google.com/')
dom = tree.getroot()

# Select every anchor (<a>) element under the root with a CSS selector.
links = dom.cssselect('a')
from lxml import etree, cssselect, html
# Read the saved page as bytes so lxml can detect the document's declared
# encoding itself instead of relying on the locale default of text mode.
with open("/you/path/index.html", "rb") as f:
    fileread = f.read()

dochtml = html.fromstring(fileread)

# "a[href]" matches only anchors that actually carry an href attribute,
# so the resulting list never contains None entries.
select = cssselect.CSSSelector("a[href]")
links = [el.get("href") for el in select(dochtml)]

# Passing a single pre-formatted string to print() yields the same
# "index url" line under both Python 2 and Python 3.
for n, l in enumerate(links):
    print("%d %s" % (n, l))