The following Python script counts how many times each HTML tag occurs in the web page at a given URL:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
try:
    import urllib2
except ImportError:  # Python 3 moved urlopen to urllib.request.
    import urllib.request as urllib2

import lxml.html
def proc_root(root, tag_count):
    """Count the tag of ``root`` and of every element nested beneath it.

    Args:
        root: An element exposing a ``tag`` attribute and yielding its
            child elements when iterated.
        tag_count: Dict mapping tag -> number of occurrences seen so far.
            It is updated in place.

    Returns:
        The same ``tag_count`` dict that was passed in.
    """
    # dict.get replaces the former bare ``except:``, which swallowed every
    # exception (KeyboardInterrupt included) and silently reset the count.
    tag_count[root.tag] = tag_count.get(root.tag, 0) + 1
    for child in root:
        proc_root(child, tag_count)
    return tag_count
def get_tag_count(url):
    """Download the page at ``url`` and count how often each tag occurs.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict mapping each tag found in the parsed document to its number
        of occurrences.

    Errors raised while fetching or parsing are not caught here and
    propagate to the caller.
    """
    tag_count = {}
    # closing() releases the HTTP response even if read() raises; the
    # original never closed it explicitly.
    with contextlib.closing(urllib2.urlopen(url)) as response:
        res = response.read()
    root = lxml.html.fromstring(res)
    proc_root(root, tag_count)
    return tag_count
def main(url='http://www.google.com'):
    """Print one ``tag<TAB>count`` line for every tag found at ``url``.

    Args:
        url: Page to analyse. Defaults to the address that was previously
            hard-coded, so calling ``main()`` behaves as before.
    """
    tag_count = get_tag_count(url)
    for tag, count in tag_count.items():
        # Single-argument call form prints identically under the Python 2
        # print statement and the Python 3 print function.
        print('%s\t%s' % (tag, count))
# Run the tag count only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
A sample output of the above script is given below:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
meta 1
table 1
font 1
style 2
span 8
script 5
tr 1
html 1
input 7
td 3
body 1
head 1
form 1
nobr 2
br 6
a 20
b 1
center 1
textarea 1
title 1
p 1
u 1
div 17
No comments:
Post a Comment