/python/

Custom dictionary file based on the content of the website

2017-01-04 22:55:47

url.txt

http://company-test.com/about-as
http://company-test.com/company
http://company-test.com/stuff

tool.py

import urllib.request
import urllib.parse
import re
import unicodedata

# Path of the text file that lists the pages to scan, one URL per line.
urlFile = "url.txt"
# Output word list, written through log_line(). The encoding is set
# explicitly so that non-ASCII words do not depend on the platform's locale
# encoding (which can raise UnicodeEncodeError on non-UTF-8 locales).
file_ = open('log.txt', 'w', encoding='utf-8')

def get_content(_url):
    """Fetch ``_url`` with a GET request and return the body as text.

    The response body is decoded as UTF-8.

    Returns:
        The decoded body, or an empty string when the request fails with
        ``URLError`` (which also covers HTTP error statuses). The failure is
        reported on stdout so the caller can carry on with the next URL.
    """
    # Imported explicitly: the file only imports urllib.request and would
    # otherwise rely on that module pulling in urllib.error as a side effect.
    from urllib.error import URLError

    req = urllib.request.Request(_url, method='GET')

    try:
        # The context manager closes the response even if read() or decode()
        # raises, so connections are not leaked across many URLs.
        with urllib.request.urlopen(req) as response:
            return response.read().decode('utf8')
    except URLError:
        print("Error: " + _url)
        return ""

# Inline event-handler attributes such as onclick="..." (double-quoted values
# only), together with one optional whitespace character on either side.
_EVENT_ATTR_RE = re.compile(r'\s?on\w+="[^"]+"\s?', re.IGNORECASE)
# Whole <script ...>...</script> elements, matched non-greedily across lines.
_SCRIPT_RE = re.compile(r'<script[\s\S]+?/script>', re.IGNORECASE)


def remove_script(data):
    """Return ``data`` with inline event handlers and script elements removed.

    Matching is case-insensitive because HTML tag and attribute names are;
    ``<SCRIPT>`` blocks and ``ONCLICK="..."`` attributes are removed just like
    their lower-case forms.
    """
    result = _EVENT_ATTR_RE.sub("", data)
    return _SCRIPT_RE.sub("", result)

def get_plain_text(data):
    """Reduce an HTML document to plain text.

    Scripts and inline event handlers are removed first, then every remaining
    tag is stripped, character references are decoded, and leftover comment
    markers and pipe characters are dropped.
    """
    # Local import keeps this function usable without touching the file's
    # import section.
    import html

    data = remove_script(data)
    data = re.sub('<[^>]*>', '', data)
    # Decode character references (&oacute;, &amp;, &nbsp;, ...) so that the
    # entity names do not end up in the text as words of their own.
    data = html.unescape(data)
    data = data.replace("-->", "")
    data = data.replace("<--", "")
    data = data.replace("|", "")
    return data

def get_uniq_words(text):
    """Return the set of distinct words in ``text``, lower-cased.

    A word is a maximal run of ``\\w`` characters (letters, digits and
    underscore).
    """
    # Use the argument itself rather than a module-level variable, so the
    # function works for any caller.
    return set(re.findall(r'\w+', text.lower()))

def remove_shor_words(text):
    """Return ``text`` with every word of one to three characters removed.

    Non-word characters immediately preceding a removed word are dropped
    together with it; longer words and their separators are left untouched.
    """
    return re.sub(r'\W*\b\w{1,3}\b', '', text)

def strip_accents(s):
    """Return ``s`` with diacritics removed.

    The string is decomposed (NFD) and every non-spacing mark (category
    ``Mn``) is discarded. The letter "ł" has no decomposition, so it is
    mapped to "l" explicitly.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept).replace("ł", "l")

def log_line(_line):
    """Write ``_line`` to the module-level ``file_`` handle.

    The text is written exactly as given; no newline is appended.
    """
    file_.write(_line)

# Every distinct (lower-cased) word collected from the fetched pages.
WORDS = set()

# The handle gets its own name so that ``urlFile`` keeps holding the path.
with open(urlFile) as url_handle:
    for urlLine in url_handle:
        url = urlLine.strip()
        if not url:
            # Blank lines in the list are not URLs; skip them quietly.
            continue

        try:
            content = get_content(url)
            plain_text = get_plain_text(content)
            plain_text = remove_shor_words(plain_text)
            WORDS |= get_uniq_words(plain_text)
        except Exception as error:
            # Best effort: a page that cannot be processed is reported and
            # the remaining URLs are still handled.
            print(error)

# Accents are stripped before de-duplicating: two words that differ only by
# diacritics collapse to the same entry, so the log has no repeated lines and
# the reported count matches what was written. Sorting makes the output
# reproducible from run to run.
dictionary_words = sorted({strip_accents(word) for word in WORDS})
for word in dictionary_words:
    log_line(word + "\n")
# Close the log so its content is flushed to disk and the handle is released.
file_.close()
print(str(len(dictionary_words)) + " unique words")