diff --git a/admin/sitegen-lib/dependencies.py b/admin/sitegen-lib/dependencies.py
new file mode 100644
--- /dev/null
+++ b/admin/sitegen-lib/dependencies.py
@@ -0,0 +1,25 @@
+"""
+The dependencies of an AFP entry are listed in its ROOT file and collected
+into a JSON dependencies file; this script reads that file and adds the AFP
+dependencies of each entry to the entry's JSON file.
+"""
+import json
+import os
+
+from write_file import write_file
+
+
+def add_dependencies(entries_dir, dependencies_file):
+    """For each entry in the entries directory, look up its dependencies and
+    add them to the entry's JSON file."""
+
+    with open(dependencies_file) as dep:
+        dependencies = json.load(dep)
+
+    for entry in os.listdir(entries_dir):
+        shortname = entry[:-3]
+        entry_deps = dependencies[shortname]
+        afp_deps = entry_deps["afp_deps"]
+
+        data = {"dependencies": afp_deps}
+        write_file(os.path.join(entries_dir, entry), data)
diff --git a/admin/sitegen-lib/keywords.py b/admin/sitegen-lib/keywords.py
new file mode 100644
--- /dev/null
+++ b/admin/sitegen-lib/keywords.py
@@ -0,0 +1,57 @@
+"""Generates a list of keywords for the search autocomplete. Each entry’s
+abstract is sanitised and then the keywords are extracted with the RAKE
+algorithm.
+"""
+import json
+import os
+import re
+from itertools import groupby
+
+import unidecode
+from rake_nltk import Rake
+import nltk
+
+nltk.download('stopwords')
+nltk.download('punkt')
+
+
+def generate_keywords(entries_dir):
+    """RAKE is used to extract the keywords from every abstract.
+
+    The top 8 keywords of each abstract are added to a list of all keywords,
+    and only the keywords that appear in two or more abstracts are kept.
+    Finally, plurals are removed."""
+
+    rake_object = Rake(max_length=2)
+
+    replacements = [
+        (r"\s+", " "),
+        (r"<.*?>", ""),
+        (r"[^\w\s/.()',-]", " "),
+        (r"\s+", " "),
+    ]
+
+    keywords = []
+
+    for entry in os.listdir(entries_dir):
+        with open(os.path.join(entries_dir, entry)) as json_file:
+            data = json.load(json_file)
+            text = data["abstract"]
+
+            for old, new in replacements:
+                text = re.sub(old, new, text)
+
+            text = unidecode.unidecode(text)
+
+            rake_object.extract_keywords_from_text(text)
+            keywords += rake_object.get_ranked_phrases()[:8]
+
+    # keep keywords that appear in 2 or more abstracts
+    keywords = [i for i, c in groupby(sorted(keywords)) if len(list(c)) > 1]
+
+    # remove plurals if we have the singular
+    for keyword in keywords:
+        if keyword + "s" in keywords:
+            keywords.remove(keyword + "s")
+
+    return [{"id": i, "keyword": x} for i, x in enumerate(keywords)]
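
As an aside (not part of the patch), a minimal sketch of how generate_keywords
behaves; the entry file names and abstracts are made up and the printed result
is only indicative:

import json
import os
import tempfile

from keywords import generate_keywords

with tempfile.TemporaryDirectory() as entries_dir:
    abstracts = {
        "Entry_A.md": "We verify a model checker for temporal logic.",
        "Entry_B.md": "A model checker for temporal logic, with machine-checked proofs.",
    }
    for name, abstract in abstracts.items():
        with open(os.path.join(entries_dir, name), "w") as f:
            json.dump({"abstract": abstract}, f)

    # only phrases extracted from two or more abstracts survive the filter, e.g.
    # [{'id': 0, 'keyword': 'model checker'}, {'id': 1, 'keyword': 'temporal logic'}]
    print(generate_keywords(entries_dir))
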
diff --git a/admin/sitegen-lib/related.py b/admin/sitegen-lib/related.py
new file mode 100644
--- /dev/null
+++ b/admin/sitegen-lib/related.py
@@ -0,0 +1,111 @@
+"""
+This script generates related entries, using three metrics:
+ * Sharing dependencies
+ * Sharing keywords
+ * Sharing topics
+
+These are weighted and used to find entries which are likely similar.
+
+The related entries are then added to the entry files to improve site navigation.
+"""
+import json
+import os
+
+from keywords import generate_keywords
+from write_file import write_file
+
+
+def add_related(entries_dir):
+    """
+    First, three dictionaries are created as follows:
+
+    dependencies = {"dependency": [list-of-entries, ...], ...}
+    keywords = {"keyword": [list-of-entries, ...], ...}
+    topics = {"topic": [list-of-entries, ...], ...}
+
+    Keywords that feature in more than 10 entries are dropped. Then
+    a dictionary is created with the relatedness scores between each
+    pair of entries. Finally, the top three related entries are chosen
+    for each entry.
+    """
+
+    keywords = {}
+
+    for obj in generate_keywords(entries_dir):
+        keywords[obj["keyword"]] = []
+
+
+    dependencies = {}
+    topics = {}
+    for entry in os.listdir(entries_dir):
+        shortname = entry[:-3]
+
+        with open(os.path.join(entries_dir, entry)) as file:
+            data = json.load(file)
+            if "dependencies" in data:
+                for dep in data["dependencies"]:
+                    if dep in dependencies:
+                        dependencies[dep].append(shortname)
+                    else:
+                        dependencies[dep] = [shortname]
+            if "topics" in data:
+                for topic in data["topics"]:
+                    if topic in topics:
+                        topics[topic].append(shortname)
+                    else:
+                        topics[topic] = [shortname]
+            for keyword in keywords.keys():
+                if keyword in data["abstract"].lower():
+                    keywords[keyword].append(shortname)
+
+    for keyword, values in list(keywords.items()):
+        if len(values) > 10:
+            keywords.pop(keyword)
+
+    related_entries = {}
+
+    for data_set, modifier in [(keywords, 1), (dependencies, 1.5), (topics, 0.5)]:
+        populate_related(data_set, related_entries, modifier)
+
+    for entry in related_entries:
+        for other, value in list(related_entries[entry].items()):
+            if value <= 2.5:
+                related_entries[entry].pop(other)
+
+    final_related = {}
+
+    for entry, scores in related_entries.items():
+        final_related[entry] = top_three(scores)
+
+    for entry, related in final_related.items():
+        if related:
+            data = {"related": related}
+            write_file(os.path.join(entries_dir, entry + ".md"), data)
+
+
+def populate_related(data, related, modifier=1):
+    """This is a heavily nested loop that populates the related dictionary.
+
+    For each of the categories, the list of entries associated with
+    each key is iterated over twice and, if the entries are not the
+    same, the modifier of that category is added to the relatedness
+    score between the two entries in the dictionary. As the loop
+    iterates twice over the value set, the resulting dictionary is
+    symmetric, i.e. the score for A->B will be equal to B->A.
+    """
+    for _, entries in data.items():
+        for key_entry in entries:
+            for value_entry in entries:
+                if value_entry != key_entry:
+                    if key_entry in related:
+                        if value_entry in related[key_entry]:
+                            related[key_entry][value_entry] += modifier
+                        else:
+                            related[key_entry][value_entry] = modifier
+                    else:
+                        related[key_entry] = {value_entry: modifier}
+
+
+def top_three(dictionary):
+    """Returns the highest three dictionary keys by value"""
+    return sorted(dictionary, key=dictionary.get, reverse=True)[:3]
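
To make the scoring in populate_related concrete, here is a small sketch with
made-up category dictionaries (the entry names A to D are hypothetical):

from related import populate_related, top_three

keywords = {"model checking": ["A", "B", "C"]}
dependencies = {"HOL-Library": ["A", "B"]}
topics = {"Logic": ["A", "B", "C", "D"]}

related = {}
for category, weight in [(keywords, 1), (dependencies, 1.5), (topics, 0.5)]:
    populate_related(category, related, weight)

# A and B share a keyword (1), a dependency (1.5) and a topic (0.5); A and C
# share a keyword and a topic; A and D only a topic.
print(related["A"])             # {'B': 3.0, 'C': 1.5, 'D': 0.5}
print(top_three(related["A"]))  # ['B', 'C', 'D']
# add_related additionally drops pairs scoring 2.5 or less, so only B would
# remain as related to A.
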
diff --git a/admin/sitegen-lib/statistics.py b/admin/sitegen-lib/statistics.py
new file mode 100644
--- /dev/null
+++ b/admin/sitegen-lib/statistics.py
@@ -0,0 +1,53 @@
+"""
+Most of the statistics for the site are generated by Hugo. This script
+generates other statistics, like the number of lines in the AFP, using the
+scripts from the current AFP.
+
+For this script to work, `return data` needs to be added at
+line 212 in templates.py
+"""
+
+import os
+
+import afpstats
+import metadata
+import templates
+from config import options
+from sitegen import associate_releases, parse, read_versions
+from write_file import write_file
+
+
+def add_statistics(base_dir, thys_dir, data_dir):
+    """Creates the necessary objects to generate the statistics,
+    then outputs them to the data directory"""
+    options.templates_dir = os.path.join(base_dir, "metadata", "templates")
+    options.dest_dir = data_dir
+
+    entries = parse(os.path.join(base_dir, "metadata", "metadata"))
+    versions = read_versions(os.path.join(base_dir, "metadata", "release-dates"))
+    associate_releases(entries, versions, os.path.join(base_dir, "metadata", "releases"))
+
+    deps_dict = metadata.empty_deps(entries)
+
+    afp_dict = afpstats.afp_dict(entries, thys_dir, deps_dict)
+    afp_dict.build_stats()
+    builder = templates.Builder(options, entries, afp_dict)
+
+    stats = builder.generate_statistics()
+
+    loc_articles = [article.loc for article in stats["articles_by_time"]]
+
+    all_articles = [a.name for a in stats["articles_by_time"]]
+
+    data = {
+        "num_lemmas": stats["num_lemmas"],
+        "num_loc": stats["num_loc"],
+        "articles_year": stats["articles_year"],
+        "loc_years": stats["loc_years"],
+        "author_years": stats["author_years"],
+        "author_years_cumulative": stats["author_years_cumulative"],
+        "loc_articles": loc_articles,
+        "all_articles": all_articles,
+    }
+
+    write_file(os.path.join(data_dir, "statistics.json"), data)
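
A possible invocation of add_statistics, assuming it is run from
admin/sitegen-lib of an AFP clone; the paths below are placeholders:

import os

from statistics import add_statistics  # the local module above, not the stdlib one

afp_root = os.path.expanduser("~/afp")  # hypothetical checkout location
add_statistics(afp_root,
               os.path.join(afp_root, "thys"),
               os.path.join(afp_root, "web", "data"))  # data_dir is a placeholder
# data_dir/statistics.json then contains num_lemmas, num_loc, the per-year
# series and the per-article line counts from the data dict above.
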
diff --git a/admin/sitegen-lib/templates.py b/admin/sitegen-lib/templates.py
--- a/admin/sitegen-lib/templates.py
+++ b/admin/sitegen-lib/templates.py
@@ -1,230 +1,231 @@
 from collections import OrderedDict
 from itertools import groupby
 import os
 import datetime
 
 from jinja2 import Environment, FileSystemLoader
 
 import terminal
 
 
 ### topics
 
 class Tree(object):
     def __init__(self):
         self.subtopics = OrderedDict()
         self.entries = []
 
     def add_topic(self, topic):
         if len(topic) > 0:
             if topic[0] not in self.subtopics:
                 tree = Tree()
                 self.subtopics[topic[0]] = tree
             else:
                 tree = self.subtopics[topic[0]]
             tree.add_topic(topic[1:])
 
     def add_to_topic(self, topic, entry):
         if len(topic) > 0:
             if topic[0] not in self.subtopics:
                 terminal.error(u"In entry {0}: unknown (sub)topic {1}".format(entry, topic), abort=True)
             else:
                 self.subtopics[topic[0]].add_to_topic(topic[1:], entry)
         else:
             self.entries.append(entry)
 
     def __str__(self):
         return self._to_str()
 
     def _to_str(self, indent=0):
         indent_str = ' ' * indent
         result = indent_str + str(self.entries) + "\n"
         for subtopic, tree in self.subtopics.items():
             result += indent_str
             result += subtopic
             result += "\n"
             result += tree._to_str(indent + 2)
         return result
 
 
 def read_topics(filename):
     tree = Tree()
     stack = []
     with open(filename) as f:
         for line in f:
             count = 0
             while line[count] == ' ':
                 count += 1
             if count % 2:
                 raise Exception(u"Illegal indentation at line '{0}'".format(line))
             level = count // 2
             if level <= len(stack):
                 stack = stack[0:level]
             else:
                 raise Exception(u"Illegal indentation at line '{0}'".format(line))
             stack.append(line[count:len(line)-1])
             tree.add_topic(stack)
     return tree
 
 
 # for topics page: group entries by topic
 def collect_topics(entries, metadata_dir):
     tree = read_topics(os.path.join(metadata_dir, "topics"))
     for entry, attributes in entries.items():
         for topic in attributes['topic']:
             tree.add_to_topic([s.strip() for s in topic.split('/')], entry)
     return tree
 
 
 class Builder():
     """Contains environment for building webpages from templates"""
 
     def __init__(self, options, entries, afp_entries):
         self.j2_env = Environment(loader=FileSystemLoader(options.templates_dir),
                                   trim_blocks=True)
         # pass functions to environment for use in templates
         self.prepare_env()
         self.options = options
         #TODO: use only afp_entries
         self.entries = entries
         self.afp_entries = afp_entries
 
     def prepare_env(self):
         def startswith(value, beginning):
             return value.startswith(beginning)
 
         def datetimeformat(value, format_str='%Y-%m-%d'):
             return value.strftime(format_str)
 
         def rfc822(value):
             # Locale could be something different than english, to prevent printing
             # non english months, we use this fix
             month = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split(" ")[value.month - 1]
             return value.strftime("%d " + month + " %Y %T %z")
 
         def split(value):
             return value.split()
 
         def short_month(value):
             return "jan feb mar apr may jun jul aug sep oct nov dec".split(" ")[value - 1]
 
         self.j2_env.filters['startswith'] = startswith
         self.j2_env.filters['datetimeformat'] = datetimeformat
         self.j2_env.filters['rfc822'] = rfc822
         self.j2_env.filters['split'] = split
         self.j2_env.filters['short_month'] = short_month
 
     def write_file(self, filename, template, values):
         # UTF-8 hack because of different string handling in python 2 vs 3
         with open(os.path.join(self.options.dest_dir, filename), 'wb') as f:
             f.write(template.render(values).encode('utf8'))
 
     def generate_standard(self, filename, template_name):
         template = self.j2_env.get_template(template_name)
         self.write_file(filename, template, {})
         terminal.success("Generated {}".format(filename))
 
     def generate_topics(self):
         tree = collect_topics(self.entries, self.options.metadata_dir)
         template = self.j2_env.get_template("topics.tpl")
         self.write_file("topics.html", template, {'tree': tree})
         terminal.success("Generated topics.html")
 
     def generate_index(self):
         data = {'is_devel': self.options.is_devel}
         by_year = groupby(sorted(self.afp_entries.values(),
                                  key=lambda e: (e.publish_date, e.name),
                                  reverse=True),
                           key=lambda e: e.publish_date.year)
         data['by_year'] = [(year, list(entries)) for year, entries in by_year]
         template = self.j2_env.get_template("index.tpl")
         self.write_file("index.html", template, data)
         terminal.success("Generated index.html")
 
     def generate_entries(self):
         counter = 0
         template = self.j2_env.get_template("entry.tpl")
         for name, entry in self.afp_entries.items():
             self.write_file(os.path.join("entries", name + ".html"), template,
                             {'entry': entry, 'is_devel': self.options.is_devel,
                              'ROOT_PATH': '../'})
             counter += 1
         for name, entry in self.afp_entries.no_index.items():
             self.write_file(os.path.join("entries", name + ".html"), template,
                             {'entry': entry, 'is_devel': self.options.is_devel,
                              'ROOT_PATH': '../'})
             counter += 1
         terminal.success("Generated html files for {:d} entries".format(counter))
 
     def generate_download(self):
         template = self.j2_env.get_template("download.tpl")
         self.write_file("download.html", template,
                         {'is_devel': self.options.is_devel})
         terminal.success("Generated download.html")
 
     def generate_statistics(self):
         #TODO: simplify with itertools
         # Count loc and articles per year
         articles_years = dict()
         loc_years = dict()
         for article in self.afp_entries.values():
             try:
                 articles_years[article.publish_date.year] += 1
                 loc_years[article.publish_date.year] += article.loc
             except KeyError:
                 articles_years[article.publish_date.year] = 1
                 loc_years[article.publish_date.year] = article.loc
         # Count new authors per year
         author_years = dict.fromkeys(articles_years.keys(), 0)
         for author in self.afp_entries.authors.values():
             first_year = min([e.publish_date.year for e in author.articles])
             try:
                 author_years[first_year] += 1
             except KeyError:
                 author_years[first_year] = 1
         # Build cumulative values
         author_years_cumulative = author_years.copy()
         for y in sorted(articles_years)[1:]:
             articles_years[y] += articles_years[y - 1]
             loc_years[y] += loc_years[y - 1]
             author_years_cumulative[y] += author_years_cumulative[y - 1]
         data = {'entries': self.afp_entries}
         data['num_lemmas'] = sum([a.lemmas for a in self.afp_entries.values()])
         data['num_loc'] = sum([a.loc for a in self.afp_entries.values()])
         data['years'] = sorted(articles_years)
         data['articles_year'] = [articles_years[y] for y in sorted(articles_years)]
         data['loc_years'] = [round(loc_years[y], -2) for y in sorted(loc_years)]
         data['author_years'] = [author_years[y] for y in sorted(author_years)]
         data['author_years_cumulative'] = [author_years_cumulative[y] for y in
                                            sorted(author_years_cumulative)]
         # Find 10 most imported entries, entries with the same number of
         # imports share one place.
         most_used = sorted([a for a in self.afp_entries.values()],
                            key=lambda x: (-len(x.used), x.name))
         # Show more than 10 articles but not more than necessary
         i = 0
         while (i < 10 or (i + 1 < len(most_used) and
                           len(most_used[i].used) == len(most_used[i + 1].used))):
             i += 1
         # Groupby iterators trigger some obscure bug in jinja2
         # https://github.com/pallets/jinja/issues/555
         # So don't use groupby iterator directly and convert to list of lists
         data['most_used'] = [(len_used, list(articles)) for (len_used, articles)
                              in groupby(most_used[:i + 1], key=lambda x: len(x.used))]
         data['articles_by_time'] = sorted(self.afp_entries.values(),
                                           key=lambda x: x.publish_date)
         data['articles_per_year'] = [(year, list(articles)) for (year, articles)
                                      in groupby(data['articles_by_time'],
                                                 key=lambda x: x.publish_date.year)]
         template = self.j2_env.get_template("statistics.tpl")
         self.write_file("statistics.html", template, data)
         terminal.success("Generated statistics.html")
+        return data
 
     def generate_status(self, build_data):
         template = self.j2_env.get_template("status.tpl")
         self.write_file("status.html", template,
                         {'entries': [self.afp_entries[e] for e in
                                      sorted(self.afp_entries)],
                          'build_data': build_data})
         terminal.success("Generated status.html")
 
     def generate_rss(self, num_entries):
         entries = sorted(self.afp_entries.values(),
                          key=lambda e: (e.publish_date, e.name),
                          reverse=True)
         template = self.j2_env.get_template("rss.tpl")
         self.write_file("rss.xml", template, {'entries': entries[:num_entries]})
         terminal.success("Generated rss.xml")
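
The tie handling in the "most used" table of generate_statistics is easy to
miss, so here is a sketch of the same cut-off loop applied to made-up import
counts instead of AFP entries:

# already sorted in descending order, as most_used is in generate_statistics
import_counts = [9, 8, 7, 7, 6, 6, 6, 5, 5, 4, 4, 4, 3]

i = 0
while (i < 10 or (i + 1 < len(import_counts)
                  and import_counts[i] == import_counts[i + 1])):
    i += 1

# the first i + 1 values are shown; both extra 4s are kept because they tie
# with the tenth value, so a group of equal counts is never split
print(import_counts[:i + 1])  # [9, 8, 7, 7, 6, 6, 6, 5, 5, 4, 4, 4]
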
diff --git a/admin/sitegen-lib/write_file.py b/admin/sitegen-lib/write_file.py
new file mode 100644
--- /dev/null
+++ b/admin/sitegen-lib/write_file.py
@@ -0,0 +1,18 @@
+import json
+import os
+
+
+def write_file(file, data, write=True, overwrite=False):
+    file_exists = os.path.isfile(file)
+
+    if file_exists and not overwrite:
+        with open(file) as r:
+            original_data = json.load(r)
+
+        data = {**original_data, **data}
+
+    # Write the file if `write` is set or if the file doesn't exist yet,
+    # i.e. skip writing only when `write` is False and the file already exists
+    if not file_exists or write:
+        with open(file, "w", encoding="utf-8") as w:
+            json.dump(data, w, ensure_ascii=False, indent=4)
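
Finally, a short sketch of write_file's merge semantics; the file name and
keys are hypothetical:

import json

from write_file import write_file

write_file("Example_Entry.md", {"dependencies": ["Dep_A"]})       # creates the file
write_file("Example_Entry.md", {"related": ["Entry_B"]})          # merged into the existing keys
write_file("Example_Entry.md", {"keywords": []}, overwrite=True)  # replaces the content entirely

with open("Example_Entry.md") as f:
    print(json.load(f))  # {'keywords': []}
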