找到所有叶子标签并更改它们的字符串。
# Characters that survive the clean-up: the ASCII letters, lower and upper case.
alphabet = 'abcdefghijklmnopqrtsuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'


def replace(soup):
    """Strip every character that is not in ``alphabet`` from strings under *soup*.

    Walks ``soup.children``. A child whose ``.string`` is truthy has that
    string reassigned to a copy containing only characters found in
    ``alphabet``; any other child is descended into recursively.

    The tree is modified in place and nothing is returned.
    """
    for node in soup.children:
        text = node.string
        if not text:
            # No usable string at this level, so look further down the tree.
            replace(node)
            continue
        # NOTE(review): bs4's Tag.string reportedly looks through a single
        # nested tag, so assigning here may drop that inner tag -- confirm
        # whether that matters for the documents being processed.
        node.string = ''.join(ch for ch in text if ch in alphabet)
from bs4 import BeautifulSoup

# Sample document: an outer <div> wrapping two leaf <div>s whose text mixes
# letters with spaces, punctuation and quotes.
orig_string = """
<div class="abc bcd">
<div class="inner1"> Hai ! this is first inner div;</div>
<div class="inner2"> "this is second div... " </div>
</div> """

# NOTE(review): no parser is named, so bs4 picks one of the installed parsers
# itself; the <html>/<body> wrapper in the sample output suggests lxml or
# html5lib was used -- confirm before pinning a specific parser here.
soup = BeautifulSoup(orig_string)

# The call form of print works on both Python 2 and Python 3; the original
# print statements are a SyntaxError on Python 3.
print(soup.prettify())  # original HTML
replace(soup)
print("")  # blank separator line between the two dumps
print(soup.prettify())  # new HTML
输出:
<html>
<body>
<div class="abc bcd">
<div class="inner1">
Hai ! this is first inner div;
</div>
<div class="inner2">
"this is second div... "
</div>
</div>
</body>
</html>
<html>
<body>
<div class="abc bcd">
<div class="inner1">
Haithisisfirstinnerdiv
</div>
<div class="inner2">
thisisseconddiv
</div>
</div>
</body>
</html>