/python/

Review webpage comments

2016-12-11 12:45:16

Review webpage comments and metadata for information leakage

It is very common, and even recommended, for programmers to include detailed comments and metadata in their source code. However, comments and metadata included in the HTML code might reveal internal information that should not be available to potential attackers. A review of comments and metadata should be done in order to determine whether any information is being leaked.

A simple Python script that copies all comments from the HTML of each listed page and stores them in a log file.

import urllib.request
import urllib.parse
import re

# Name of the directory that holds the input files for one review run.
project = "test"
# One URL per line; every listed page is fetched and scanned.
urlFile = project + "/url.txt"
# One entry per line; comments whose stripped text equals an entry are skipped.
filterUrl = project + "/filter.txt"

# The fetched pages are decoded as UTF-8, so the log must be able to store any
# Unicode character. Relying on the platform default encoding can raise
# UnicodeEncodeError on systems whose locale is not UTF-8.
file_ = open('log.txt', 'w', encoding='utf-8')
# Filter entries loaded from `filterUrl`.
garbage = []

def get_content(_url):
    """Download ``_url`` with a GET request and return its body as text.

    Args:
        _url: Absolute URL of the resource to fetch.

    Returns:
        The response body decoded as UTF-8, or an empty string when the
        request fails with ``urllib.error.URLError`` (the failing URL is
        printed in that case).

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8. Other exceptions
            that are not ``URLError`` subclasses also propagate to the caller.
    """
    req = urllib.request.Request(_url, method='GET')

    try:
        # The context manager closes the response (and its underlying
        # connection) even if reading or decoding fails.
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf8')
    except urllib.error.URLError:
        print("Error: " + _url)
        return ""

def get_html_comments(_html):
    """Return the inner text of every HTML comment found in ``_html``.

    Args:
        _html: HTML source as a string.

    Returns:
        A list with the text between each ``<!--`` and the nearest following
        ``-->``, in document order. Surrounding whitespace is preserved, and
        an empty comment (``<!---->``) yields an empty string.
    """
    # DOTALL lets "." match newlines so multi-line comments are captured.
    # ".*?" (rather than ".+?") stops an empty comment from swallowing
    # everything up to the next "-->" in the document.
    p = re.compile(r'<!--(.*?)-->', re.DOTALL)
    return p.findall(_html)

def log_line(_line):
    """Append ``_line`` verbatim to the module-level log file ``file_``.

    No newline is added; callers supply their own line terminator.
    """
    emit = file_.write
    emit(_line)

def init_filter():
    """Load the filter entries from ``filterUrl`` into ``garbage``.

    Every line of the file is stripped of surrounding whitespace and appended
    to the module-level ``garbage`` list, preserving file order.
    """
    with open(filterUrl) as source:
        garbage.extend(entry.strip() for entry in source)

def is_valid(_content):
    """Tell whether a comment should be written to the log.

    Args:
        _content: Raw comment text.

    Returns:
        ``True`` when the comment, stripped of surrounding whitespace, is not
        one of the entries in the module-level ``garbage`` filter list;
        ``False`` otherwise.
    """
    # "not in" states the intent directly instead of comparing to False.
    return _content.strip() not in garbage

# Load the filter entries before any page is processed.
init_filter()

try:
    # A separate name for the handle keeps the `urlFile` path string intact
    # instead of rebinding it to the open file object.
    with open(urlFile) as url_handle:
        for idxUrl, urlLine in enumerate(url_handle):
            url = urlLine.strip()
            if not url:
                # Blank lines (e.g. a trailing newline) are not URLs; skip
                # them instead of logging an empty section and failing the
                # request.
                continue
            try:
                log_line("---------- " + url + " ----------\n")
                content = get_content(url)
                # Newlines are removed so that comments spanning several
                # lines are seen as a single line by the extractor.
                comments = get_html_comments(content.strip().replace('\n', ''))
                for comment in comments:
                    if is_valid(comment):
                        log_line(comment.strip() + "\n")
                log_line("--------------------------\n")
                print(idxUrl, url)
            except Exception as error:
                # Best effort: a failure on one URL is reported and must not
                # stop the remaining URLs from being processed.
                print(error)
finally:
    # Flush and release the log file even if the URL list cannot be read.
    file_.close()

/test/filter.txt

.col-md-12
.entry-meta
.entry-header
.entry-summary

/test/url.txt

http://www.blog.btbw.pl/
http://www.blog.btbw.pl/robots.txt
http://www.blog.btbw.pl/sitemap.xml
http://www.blog.btbw.pl/category/java/
http://www.blog.btbw.pl/category/java-script/angularjs/