get_wikipedia_page.py

import wikipedia
import json
import os


# wiki_html = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.HTML)

wikipedia.set_lang("en")

CACHE_PATH = 'cached_pages/'

# make sure the cache directory exists before reading from or writing to it
os.makedirs(CACHE_PATH, exist_ok=True)

# --- cache system ---


def json_parsing(name_path):
    # read a cached page back from its JSON file (returns a plain dict)
    with open(name_path, 'r') as file:
        return json.load(file)


def json_writing(name_path, data):
    # serialize the page to JSON; the `default` lambda falls back to an
    # object's __dict__ for non-serializable types (e.g. a WikipediaPage)
    with open(name_path, 'w') as file:
        json.dump(data, file, indent=4, default=lambda o: o.__dict__)


# --- wiki request ---

def get_wikipage(pagename, lang = "en"):
    # get wikipedia page content by name of the page
    
    try:
        results = wikipedia.search(pagename, results=1, suggestion=False)
        try:
            pagename = results[0]
        except IndexError:
            # if there is no suggestion or search results, the page doesn't exist
            raise wikipedia.PageError(pagename)
        return wikipedia.WikipediaPage(pagename, redirect=True, preload=True)
    except wikipedia.exceptions.DisambiguationError as e:
        print(e.options)
        page = ''

    return page

def request_page(request):

    page = get_wikipage(request)
    if page is None:
        # disambiguation (or no page at all): nothing to clean or cache
        return None

    # strip inline styling from the HTML content: split on '<' and drop
    # every <link ...> and <style ...> fragment (a style fragment carries
    # its CSS payload up to the next '<'; the bare </style> closers remain)
    p_html = page.html().split("<")
    p_clean = []

    for t in p_html:
        if t.startswith("link") or t.startswith("style") or t == "":
            pass
        else:
            p_clean.append("<" + t)

    # store the original request string and the cleaned HTML
    # on the page object as custom attributes
    setattr(page, 'request', request)
    setattr(page, 'cleaned_content', ''.join(p_clean))

    # write the page to the cache
    json_writing(CACHE_PATH + request + '.json', page)

    return page


def get_page(request):

    cache_file = CACHE_PATH + request + '.json'

    # serve from the cache when possible; note that a cached page comes
    # back as a plain dict, while a fresh one is a WikipediaPage object
    if os.path.isfile(cache_file):
        print("--- FROM CACHE: " + request)
        page = json_parsing(cache_file)

    # otherwise request the page through the wikipedia module + API
    else:
        print("--- REQUEST: " + request)
        page = request_page(request)

    return page
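

# A minimal usage sketch, assuming this file is run directly and the
# `wikipedia` package is installed; "Python (programming language)" is
# an illustrative page title, not one used elsewhere in this repository.
if __name__ == '__main__':
    result = get_page("Python (programming language)")
    # a fresh page is a WikipediaPage object, a cached one is a dict,
    # and None means the request was ambiguous or matched no page
    if result is not None:
        print("--- OK")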