Review webpage comments
2016-12-11 12:45:16
Review webpage comments and metadata for information leakage
It is very common, and even recommended, for programmers to include detailed comments and metadata in their source code. However, comments and metadata included in the HTML code might reveal internal information that should not be available to potential attackers. A review of comments and metadata should therefore be performed to determine whether any information is being leaked.
Below is a simple Python script that extracts all comments from the HTML of each listed URL and stores them in a log file.
import re
import urllib.error
import urllib.parse
import urllib.request
# Per-project input locations: a list of URLs to scan and a list of
# comment strings to ignore, both one entry per line.
project = "test"
urlFile = project + "/url.txt"
filterUrl = project + "/filter.txt"
# Shared log handle for the whole run.  Page bodies are decoded as UTF-8
# (see get_content), so the log must be UTF-8 too — the locale-default
# encoding used originally could raise UnicodeEncodeError on write.
file_ = open('log.txt', 'w', encoding='utf-8')
# Comments considered noise; populated from filterUrl by init_filter().
garbage = []
def get_content(_url):
    """Fetch *_url* with an HTTP GET and return the body decoded as UTF-8.

    On any URL error (unreachable host, missing file, HTTP error wrapped
    by urllib) the failing URL is printed and an empty string is returned,
    so callers can keep iterating over their URL list.
    """
    req = urllib.request.Request(_url, method='GET')
    try:
        # 'with' closes the response explicitly; the original version
        # leaked the underlying socket/file handle.
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf8')
    except urllib.error.URLError:
        print("Error: " + _url)
        return ""
def get_html_comments(_html):
    """Return the body of every ``<!-- ... -->`` comment found in *_html*."""
    # Lazy quantifier keeps adjacent comments as separate matches; '.'
    # does not cross newlines, so callers flatten the HTML beforehand.
    return re.findall(r'<!--(.+?)-->', _html)
def log_line(_line):
    """Append *_line* verbatim (no extra newline) to the shared log file."""
    print(_line, end='', file=file_)
def init_filter():
    """Load the ignore-list file into the module-level ``garbage`` list."""
    with open(filterUrl) as filter_file:
        # One pattern per line; strip() drops the trailing newline.
        garbage.extend(raw.strip() for raw in filter_file)
def is_valid(_content):
    """Return True if the stripped *_content* is not on the ignore list.

    Replaces the ``(...) == False`` anti-idiom with ``not in``; the
    membership test against the module-level ``garbage`` list is unchanged.
    """
    return _content.strip() not in garbage
# ---- script entry point ----------------------------------------------------
# For each URL in urlFile: log a header, fetch the page, extract HTML
# comments, and log every comment not on the ignore list.
init_filter()
# NOTE: the original bound the file object to the same name as the
# urlFile path string, shadowing the module global; use a distinct name.
with open(urlFile) as url_file:
    for idxUrl, urlLine in enumerate(url_file):
        try:
            url = urlLine.strip()
            log_line("---------- " + url + " ----------\n")
            content = get_content(url)
            # Flatten the HTML so multi-line comments survive the regex,
            # which cannot match across newlines.
            comments = get_html_comments(content.strip().replace('\n', ''))
            for comment in comments:
                if is_valid(comment):
                    log_line(comment.strip() + "\n")
            log_line("--------------------------\n")
            print(idxUrl, url)
        except Exception as error:
            # Best-effort batch run: report the failure and move on to
            # the next URL rather than aborting the whole scan.
            print(error)
file_.close()  # flush and release the log now that all URLs are processed
Example input files: /test/filter.txt (comment strings to ignore, one per line) and /test/url.txt (URLs to scan, one per line).