diff --git a/admin/site/config.json b/admin/site/config.json
--- a/admin/site/config.json
+++ b/admin/site/config.json
@@ -1,43 +1,58 @@
 {
   "baseURL": "/",
   "languageCode": "en-gb",
   "title": "Archive of Formal Proofs",
   "theme": "afp",
   "publishDir": "..",
   "taxonomies": {
     "author": "authors",
     "topic": "topics",
     "dependency": "dependencies"
   },
   "params": {
     "description": "A collection of proof libraries, examples, and larger scientific developments, mechanically checked in the theorem prover Isabelle.",
     "images": [
       "/images/afp.png"
     ],
     "title": "Archive of Formal Proofs",
     "mainSections": ["entries"],
     "afpUrls": {
       "html": "https://www.isa-afp.org/browser_info/current/AFP"
     }
   },
   "markup": {
     "goldmark": {
       "renderer": {
         "unsafe": true
       }
     }
   },
   "menu": {
     "main": [
       {
         "name": "Topics",
         "url": "/topics/",
         "weight": 2
       }
     ]
   },
   "outputs": {
     "home": ["html", "rss", "json"],
     "taxonomyTerm": ["html", "json"]
+  },
+  "related": {
+    "includeNewer": true,
+    "indices": [
+      {
+        "name": "topics",
+        "weight": 1
+      },
+      {
+        "name": "keywords",
+        "weight": 3
+      }
+    ],
+    "threshold": 80,
+    "toLower": true
   }
 }
\ No newline at end of file
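
The new "related" block enables Hugo's built-in related-content engine for entry pages: candidates are matched on the indexed front-matter fields, a shared keyword counts three times as much as a shared topic, and low-scoring matches are cut off by the threshold. A rough Python sketch of the weighted matching under these settings, with invented entry data (Hugo's exact score normalisation differs; this only illustrates the weighting):

    # Sketch of weighted index matching as configured above; the entries and
    # the flat scoring are invented for illustration, not Hugo's actual code.
    weights = {"keywords": 3, "topics": 1}

    def relatedness(a, b):
        # sum the index weight for every indexed term the two entries share
        return sum(w * len(set(a[idx]) & set(b[idx])) for idx, w in weights.items())

    this = {"keywords": ["automata", "regular languages"], "topics": ["Automata"]}
    that = {"keywords": ["automata"], "topics": ["Automata"]}
    print(relatedness(this, that))  # 3*1 + 1*1 = 4

The template change below consumes this configuration through .Site.RegularPages.Related.
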
diff --git a/admin/site/themes/afp/layouts/entries/single.html b/admin/site/themes/afp/layouts/entries/single.html
--- a/admin/site/themes/afp/layouts/entries/single.html
+++ b/admin/site/themes/afp/layouts/entries/single.html
@@ -1,129 +1,130 @@
{{- define "main" -}}
{{- $site := . -}}
{{- $entry_name := .File.BaseFileName -}}
Abstract

{{- trim .Params.abstract "\n" | safeHTML -}}
{{- if (eq .Params.license "BSD") -}}
BSD License
{{- else if (eq .Params.license "LGPL") -}}
GNU Lesser General Public License (LGPL)
{{- else -}}
{{- printf "%s License" .Params.license -}}
{{- end -}}
{{- with .Params.extra -}}
{{- range $key, $value := . -}}

{{- humanize $key -}}

{{- $value | safeHTML -}}

{{- end -}}
{{- end -}}
{{- $Scratch := newScratch -}}
{{- with .Params.dependencies -}}

Depends On

{{- end -}}
{{- with (index .Site.Taxonomies.dependencies (urlize $entry_name)) -}}

Used by

{{- end -}}
{{- with .Params.topics -}}

Topics

{{- end -}} - {{- with .Params.relatedEntries -}} -

Related Entries

+

Theories

- {{- end -}}
-

Theories

-
+ {{ $related := .Site.RegularPages.Related . | first 3 }}
+ {{ with $related }}
+

Related Entries

+
+ {{ end }}
+
-
{{- end -}}

diff --git a/admin/sitegen-lib/keywords.py b/admin/sitegen-lib/keywords.py
--- a/admin/sitegen-lib/keywords.py
+++ b/admin/sitegen-lib/keywords.py
@@ -1,65 +1,9 @@
-"""Generates a list of keywords for the search autocomplete. Each entry’s
-abstract is sanitised and then the keywords are extracted with the RAKE
-algorithm.
-"""
-import json
-import os
-import re
-from itertools import groupby
-
-import unidecode
-from rake_nltk import Rake
-import nltk
+"""Generates a list of keywords for the search autocomplete using RAKE."""
 import RAKE
 
-nltk.download('stopwords')
-nltk.download('punkt')
-
-
-def generate_keywords(entries_dir):
-    """RAKE is used to extract the keywords from every abstract.
-
-    The top 8 keywords are added to a list of all keywords and the keywords
-    that appear in more than two abstracts are preserved. Finally, plurals
-    are removed."""
-
-    rake_object = Rake(max_length=2)
-
-    replacements = [
-        (r"\s+", " "),
-        (r"<.*?>", ""),
-        (r"[^\w\s/.()',-]", " "),
-        (r"\s+", " "),
-    ]
-
-    keywords = []
-
-    for entry in os.listdir(entries_dir):
-        with open(os.path.join(entries_dir, entry)) as json_file:
-            data = json.load(json_file)
-            text = data["abstract"]
-
-        for old, new in replacements:
-            text = re.sub(old, new, text)
-
-        text = unidecode.unidecode(text)
-
-        rake_object.extract_keywords_from_text(text)
-        keywords.extend(rake_object.get_ranked_phrases())
-
-    # keep keywords that appear in 2 or more abstracts
-    keywords = [i for i, c in groupby(sorted(keywords)) if len(list(c)) > 1]
-
-    # remove plurals if we have the singular
-    for keyword in keywords:
-        if keyword + "s" in keywords:
-            keywords.remove(keyword + "s")
-
-    return [{"id": i, "keyword": x} for i, x in enumerate(keywords)]
-
 
 def print_keywords(text):
     rake = RAKE.Rake(RAKE.SmartStopList())
-    res = rake.run(text, minCharacters=3, maxWords=3, minFrequency=1)
+    res = rake.run(text, minCharacters=3, maxWords=2, minFrequency=1)
     for keyword in res:
         print(keyword)
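
keywords.py now only wraps python-rake: print_keywords emits one (keyword, score) tuple per line, which is exactly the format the Scala caller parses. A minimal sketch of the new extraction path, with an invented abstract (scores depend on the stop list):

    # Minimal sketch of the new keyword extraction; the sample text is invented.
    import RAKE

    text = "We formalise finite automata and prove the pumping lemma."
    rake = RAKE.Rake(RAKE.SmartStopList())
    for pair in rake.run(text, minCharacters=3, maxWords=2, minFrequency=1):
        # Each printed line looks like ('finite automata', 4.0); extract_keywords
        # in tools/afp_site_gen.scala parses exactly this tuple syntax.
        print(pair)

Lowering maxWords from 3 to 2 matches the old rake-nltk call's max_length=2, so extracted phrases stay at most two words long.
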
- """ - - keywords = {} - - kws = generate_keywords(entries_dir) - with open(keywords_file, "w") as f: - json.dump(kws, f, ensure_ascii=False, separators=(",", ":")) - - for obj in kws: - keywords[obj["keyword"]] = [] - - - dependencies = {} - topics = {} - for entry in os.listdir(entries_dir): - shortname = entry[:-3] - - with open(os.path.join(entries_dir, entry)) as file: - data = json.load(file) - if "dependencies" in data: - for dep in data["dependencies"]: - if dep in dependencies: - dependencies[dep].append(shortname) - else: - dependencies[dep] = [shortname] - if "topics" in data: - for topic in data["topics"]: - if topic in topics: - topics[topic].append(shortname) - else: - topics[topic] = [shortname] - for keyword in keywords.keys(): - if keyword in data["abstract"].lower(): - keywords[keyword].append(shortname) - - for keyword, values in list(keywords.items()): - if len(values) > 10: - keywords.pop(keyword) - - related_entries = {} - - for dataSet in [(keywords, 1), (dependencies, 1.5), (topics, 0.5)]: - populate_related(dataSet[0], related_entries, dataSet[1]) - - for entry in related_entries: - for keyword, value in list(related_entries[entry].items()): - if value <= 2.5: - related_entries[entry].pop(keyword) - - final_related = {} - - for keyword, values in related_entries.items(): - final_related[keyword] = top_three(values) - - for entry, related in final_related.items(): - if related: - data = {"related": related} - write_file(os.path.join(entries_dir, entry + ".md"), data) - - -def populate_related(data, related, modifier=1): - """This is a heavily nested loop to create the relatedEntries dictionary. - - For each of the categories, the list of entries associated with - each key is iterated over twice and, if the entries are not the - same, the modifier of that category is added to the relatedness - score between the two entries in the dictionary. As the loop - iterates twice over the value set, the resulting dictionary is - bijective — i.e., the value for A->B will be equal to B->A. 
- """ - for _, entries in data.items(): - for keyEntry in entries: - for valueEntry in entries: - if valueEntry != keyEntry: - if keyEntry in related: - if valueEntry in related[keyEntry]: - related[keyEntry][valueEntry] += modifier - else: - related[keyEntry][valueEntry] = modifier - else: - related[keyEntry] = {valueEntry: modifier} - - -def top_three(dictionary): - """Returns the highest three dictionary keys by value""" - return sorted(dictionary, key=dictionary.get, reverse=True)[:3] diff --git a/admin/sitegen-req.txt b/admin/sitegen-req.txt --- a/admin/sitegen-req.txt +++ b/admin/sitegen-req.txt @@ -1,11 +1,10 @@ Jinja2==2.10 termcolor==1.1.0 pytz==2018.5 six==1.11.0 tqdm==4.62.3 unidecode==1.3.2 bs4==0.0.1 requests==2.26.0 lxml==4.6.3 -rake-nltk==1.0.6 python-rake==1.5.0 \ No newline at end of file diff --git a/tools/afp_site_gen.scala b/tools/afp_site_gen.scala --- a/tools/afp_site_gen.scala +++ b/tools/afp_site_gen.scala @@ -1,272 +1,264 @@ package afp import isabelle._ object AFP_Site_Gen { /* json */ object JSON { type T = isabelle.JSON.T object Object { type T = isabelle.JSON.Object.T } def from_authors(authors: List[Metadata.Author]): T = Metadata.TOML.from_authors(authors) def from_topics(topics: List[Metadata.Topic]): T = Metadata.TOML.from_topics(topics) def from_affiliations(affiliations: List[Metadata.Affiliation]): Object.T = { Utils.group_sorted(affiliations, (a: Metadata.Affiliation) => a.author).view.mapValues(author_affiliations => { val homepage = author_affiliations.collectFirst { case Metadata.Homepage(_, _, url) => url } val email = author_affiliations.collectFirst { case Metadata.Email(_, _, address) => address } isabelle.JSON.Object( homepage.map(s => List("homepage" -> s)).getOrElse(Nil) ::: email.map(s => List("email" -> s)).getOrElse(Nil): _*) }).toMap } def from_change_history(history: Metadata.Change_History): Object.T = { if (history.isEmpty) { Map.empty } else { Map("Change history" -> history.map { case (date, str) => "[" + date.toString + "] " + str }.mkString("\n")) } } def from_entry(entry: Metadata.Entry): JSON.Object.T = isabelle.JSON.Object( "title" -> entry.title :: "authors" -> entry.authors.map(_.author).distinct :: "affiliations" -> from_affiliations(entry.authors ++ entry.contributors) :: (if (entry.contributors.nonEmpty) "contributors" -> entry.contributors.map(_.author).distinct :: Nil else Nil) ::: "date" -> entry.date.toString :: "topics" -> entry.topics.map(_.id) :: "abstract" -> entry.`abstract` :: "license" -> entry.license :: (if (entry.releases.nonEmpty) "releases" -> entry.releases.map(r => r.isabelle -> r.date.toString).toMap :: Nil else Nil) ::: (if (entry.note.nonEmpty) "note" -> entry.note :: Nil else Nil) ::: (if (entry.change_history.nonEmpty || entry.extra.nonEmpty) "extra" -> (from_change_history(entry.change_history) ++ entry.extra) :: Nil else Nil): _*) + + def from_keywords(keywords: List[String]): T = + keywords.zipWithIndex.map + { case (keyword, i) => isabelle.JSON.Object("id" -> i, "keyword" -> keyword) } } /* keyword extraction */ private val replacements = List( "<[^>]*>".r -> "", "[^\\w\\s()'.,;:-]".r -> " ", "\\s+".r -> " ") def extract_keywords(text: String): List[String] = { val stripped_text = replacements.foldLeft(text) { case (res, (regex, replacement)) => regex.replaceAllIn(res, replacement) } val arg = quote(stripped_text.replaceAll("\"", "\\\"")) val keyword_cmd = "from keywords import *; print_keywords(" + arg + ")" Python.run(keyword_cmd).check.out_lines } /* site generation */ def afp_site_gen( out_dir: 
diff --git a/tools/afp_site_gen.scala b/tools/afp_site_gen.scala
--- a/tools/afp_site_gen.scala
+++ b/tools/afp_site_gen.scala
@@ -1,272 +1,264 @@
 package afp
 
 import isabelle._
 
 object AFP_Site_Gen {
   /* json */
 
   object JSON {
     type T = isabelle.JSON.T
 
     object Object {
       type T = isabelle.JSON.Object.T
     }
 
     def from_authors(authors: List[Metadata.Author]): T =
       Metadata.TOML.from_authors(authors)
 
     def from_topics(topics: List[Metadata.Topic]): T =
       Metadata.TOML.from_topics(topics)
 
     def from_affiliations(affiliations: List[Metadata.Affiliation]): Object.T = {
       Utils.group_sorted(affiliations, (a: Metadata.Affiliation) => a.author).view
         .mapValues(author_affiliations => {
           val homepage = author_affiliations.collectFirst { case Metadata.Homepage(_, _, url) => url }
           val email = author_affiliations.collectFirst { case Metadata.Email(_, _, address) => address }
 
           isabelle.JSON.Object(
             homepage.map(s => List("homepage" -> s)).getOrElse(Nil) :::
               email.map(s => List("email" -> s)).getOrElse(Nil): _*)
         }).toMap
     }
 
     def from_change_history(history: Metadata.Change_History): Object.T = {
       if (history.isEmpty) {
         Map.empty
       } else {
         Map("Change history" ->
           history.map { case (date, str) => "[" + date.toString + "] " + str }.mkString("\n"))
       }
     }
 
     def from_entry(entry: Metadata.Entry): JSON.Object.T =
       isabelle.JSON.Object(
         "title" -> entry.title ::
           "authors" -> entry.authors.map(_.author).distinct ::
           "affiliations" -> from_affiliations(entry.authors ++ entry.contributors) ::
           (if (entry.contributors.nonEmpty)
             "contributors" -> entry.contributors.map(_.author).distinct :: Nil
            else Nil) :::
           "date" -> entry.date.toString ::
           "topics" -> entry.topics.map(_.id) ::
           "abstract" -> entry.`abstract` ::
           "license" -> entry.license ::
           (if (entry.releases.nonEmpty)
             "releases" -> entry.releases.map(r => r.isabelle -> r.date.toString).toMap :: Nil
            else Nil) :::
           (if (entry.note.nonEmpty) "note" -> entry.note :: Nil else Nil) :::
           (if (entry.change_history.nonEmpty || entry.extra.nonEmpty)
             "extra" -> (from_change_history(entry.change_history) ++ entry.extra) :: Nil
            else Nil): _*)
+
+    def from_keywords(keywords: List[String]): T =
+      keywords.zipWithIndex.map
+        { case (keyword, i) => isabelle.JSON.Object("id" -> i, "keyword" -> keyword) }
   }
 
   /* keyword extraction */
 
   private val replacements = List(
     "<[^>]*>".r -> "",
     "[^\\w\\s()'.,;:-]".r -> " ",
     "\\s+".r -> " ")
 
   def extract_keywords(text: String): List[String] = {
     val stripped_text =
       replacements.foldLeft(text) {
         case (res, (regex, replacement)) => regex.replaceAllIn(res, replacement)
       }
     val arg = quote(stripped_text.replaceAll("\"", "\\\""))
     val keyword_cmd = "from keywords import *; print_keywords(" + arg + ")"
     Python.run(keyword_cmd).check.out_lines
   }
 
   /* site generation */
 
   def afp_site_gen(
     out_dir: Option[Path],
     layout: Hugo.Layout,
     afp_structure: AFP_Structure,
     progress: Progress = new Progress()): Unit = {
 
     /* add authors */
 
     progress.echo("Preparing authors...")
 
     val authors = afp_structure.load_authors
     val authors_by_id = Utils.grouped_sorted(authors, (a: Metadata.Author) => a.id)
 
     layout.write_data(Path.basic("authors.json"), JSON.from_authors(authors))
 
     /* add topics */
 
     progress.echo("Preparing topics...")
 
     val topics = afp_structure.load_topics
 
     def sub_topics(topic: Metadata.Topic): List[Metadata.Topic] =
       topic :: topic.sub_topics.flatMap(sub_topics)
 
     val topics_by_id = Utils.grouped_sorted(topics.flatMap(sub_topics), (t: Metadata.Topic) => t.id)
 
     layout.write_data(Path.basic("topics.json"), JSON.from_topics(topics))
 
     /* add releases */
 
     progress.echo("Preparing releases...")
 
     val releases_by_entry = afp_structure.load_releases.groupBy(_.entry)
 
     /* extract keywords */
 
     progress.echo("Extracting keywords...")
 
     var seen_keywords = Set.empty[String]
     val entry_keywords = afp_structure.entries.map(name => {
       val entry = afp_structure.load_entry(name, authors_by_id, topics_by_id, releases_by_entry)
 
       val Keyword = """\('([^']*)', ([^)]*)\)""".r
       val scored_keywords = extract_keywords(entry.`abstract`).map {
         case Keyword(keyword, score) => keyword -> score.toDouble
         case s => error("Could not parse: " + s)
       }
       seen_keywords ++= scored_keywords.map(_._1)
 
       name -> scored_keywords.filter(_._2 > 1.0).map(_._1)
     }).toMap
 
     seen_keywords = seen_keywords.filter(k =>
       !k.endsWith("s") || !seen_keywords.contains(k.stripSuffix("s")))
 
+    layout.write_static(Path.make(List("data", "keywords.json")), JSON.from_keywords(seen_keywords.toList))
+
     def get_keywords(name: Metadata.Entry.Name): List[String] =
       entry_keywords(name).filter(seen_keywords.contains).take(8)
 
     /* add entries and theory listings */
 
     progress.echo("Preparing entries...")
 
     val sessions_structure = afp_structure.sessions_structure
     val sessions_deps = Sessions.deps(sessions_structure)
 
     for (name <- afp_structure.entries) {
       val entry = afp_structure.load_entry(name, authors_by_id, topics_by_id, releases_by_entry)
 
       val deps =
         for {
           session <- afp_structure.entry_sessions(name)
           dep <- sessions_structure.imports_graph.imm_preds(session.name)
           if session.name != dep && sessions_structure(dep).groups.contains("afp")
         } yield dep
 
       val topo_theories =
         for {
           session <- afp_structure.entry_sessions(name)
           base = sessions_deps(session.name)
           node <- base.session_theories
         } yield node.theory_base_name
 
       val entry_json = JSON.from_entry(entry) ++ isabelle.JSON.Object(
         "dependencies" -> deps.distinct,
         "theories" -> topo_theories,
         "aliases" -> List("/entries/" + name + ".html"),
         "keywords" -> get_keywords(name))
 
       val theories_json = isabelle.JSON.Object(
         "url" -> ("/entries/" + name.toLowerCase + "/theories"),
         "theories" -> topo_theories)
 
       layout.write_content(Path.make(List("entries", name + ".md")), entry_json)
       layout.write_content(Path.make(List("theories", name + ".md")), theories_json)
     }
 
-    /* add related entries and keywords */
-
-    progress.echo("Preparing related entries...")
-
-    val entries_dir = layout.content_dir + Path.basic("entries")
-    val keywords_file = layout.static_dir + Path.make(List("data", "keywords.json"))
-    keywords_file.dir.file.mkdirs()
-    val related_cmd = "from related import *; add_related(" +
-      commas_quote(List(entries_dir.implode, keywords_file.implode)) +
-      ")"
-    Python.run(related_cmd).check
-
-
     /* add statistics */
 
     progress.echo("Preparing statistics...")
 
     val statistics_cmd = "from statistics import *; add_statistics(" +
       commas_quote(
         List(
           Path.explode("$AFP_BASE").absolute.implode,
           Path.explode("$AFP").absolute.implode,
           layout.data_dir.implode)) +
       ")"
 
     Python.run(statistics_cmd).check
 
     (layout.data_dir + Path.basic("statistics.html")).file.delete()
 
     /* project */
 
     progress.echo("Preparing project files")
 
     layout.copy_project()
 
     /* hugo */
 
     out_dir match {
       case Some(out_dir) =>
         progress.echo("Building site...")
 
         Hugo.build(layout, out_dir).check
 
         progress.echo("Finished building site")
       case None =>
         progress.echo("Finished sitegen preparation.")
     }
   }
 
   val isabelle_tool =
     Isabelle_Tool("afp_site_gen", "generates afp website source", Scala_Project.here,
       args => {
         var base_dir = Path.explode("$AFP_BASE")
         var hugo_dir = base_dir + Path.make(List("web", "hugo"))
         var out_dir: Option[Path] = None
 
         val getopts = Getopts("""
 Usage: isabelle afp_site_gen [OPTIONS]
 
   Options are:
     -B DIR       afp base dir (default """" + base_dir.implode + """")
     -H DIR       generated hugo project dir (default """" + hugo_dir.implode + """")
     -O DIR       output dir (default none)
 
   Generates the AFP website source. HTML files of entries are dynamically loaded.
   Site will be built from generated source if output dir is specified.
 """,
           "B:" -> (arg => base_dir = Path.explode(arg)),
           "H:" -> (arg => hugo_dir = Path.explode(arg)),
           "O:" -> (arg => out_dir = Some(Path.explode(arg))))
 
         getopts(args)
 
         val afp_structure = AFP_Structure(base_dir)
         val layout = Hugo.Layout(hugo_dir)
         val progress = new Console_Progress()
 
         progress.echo("Preparing site generation in " + hugo_dir.implode)
 
         afp_site_gen(out_dir = out_dir, layout = layout, afp_structure = afp_structure,
           progress = progress)
       })
 }
\ No newline at end of file
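
extract_keywords shells out to print_keywords and parses each emitted tuple with the Keyword regex; per entry, only keywords scoring above 1.0 are kept, redundant plurals are dropped, and at most 8 end up in the entry's front matter. The regex transliterates directly to Python, shown here against an invented sample line:

    # The Scala Keyword regex, transliterated to Python; sample line invented.
    import re

    keyword_line = re.compile(r"\('([^']*)', ([^)]*)\)")
    match = keyword_line.fullmatch("('finite automata', 4.0)")
    assert match is not None
    keyword, score = match.group(1), float(match.group(2))
    print(keyword, score)  # finite automata 4.0
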
Path.explode("$AFP_BASE").absolute.implode, Path.explode("$AFP").absolute.implode, layout.data_dir.implode)) + ")" Python.run(statistics_cmd).check (layout.data_dir + Path.basic("statistics.html")).file.delete() /* project */ progress.echo("Preparing project files") layout.copy_project() /* hugo */ out_dir match { case Some(out_dir) => progress.echo("Building site...") Hugo.build(layout, out_dir).check progress.echo("Finished building site") case None => progress.echo("Finished sitegen preparation.") } } val isabelle_tool = Isabelle_Tool("afp_site_gen", "generates afp website source", Scala_Project.here, args => { var base_dir = Path.explode("$AFP_BASE") var hugo_dir = base_dir + Path.make(List("web", "hugo")) var out_dir: Option[Path] = None val getopts = Getopts(""" Usage: isabelle afp_site_gen [OPTIONS] Options are: -B DIR afp base dir (default """" + base_dir.implode + """") -H DIR generated hugo project dir (default """" + hugo_dir.implode + """") -O DIR output dir (default none) Generates the AFP website source. HTML files of entries are dynamically loaded. Site will be built from generated source if output dir is specified. """, "B:" -> (arg => base_dir = Path.explode(arg)), "H:" -> (arg => hugo_dir = Path.explode(arg)), "O:" -> (arg => out_dir = Some(Path.explode(arg)))) getopts(args) val afp_structure = AFP_Structure(base_dir) val layout = Hugo.Layout(hugo_dir) val progress = new Console_Progress() progress.echo("Preparing site generation in " + hugo_dir.implode) afp_site_gen(out_dir = out_dir, layout = layout, afp_structure = afp_structure, progress = progress) }) } \ No newline at end of file diff --git a/tools/hugo.scala b/tools/hugo.scala --- a/tools/hugo.scala +++ b/tools/hugo.scala @@ -1,80 +1,80 @@ package afp import isabelle._ object Hugo { val hugo_home = Isabelle_System.getenv("ISABELLE_HUGO") val hugo_static = Path.explode("$AFP_BASE") + Path.make(List("admin", "site")) class Layout private(private[Hugo] val src_dir: Path) { private def write(file: Path, content: String): Unit = { val path = src_dir + file if (!path.dir.file.exists()) path.dir.file.mkdirs() File.write(path, content) } val data_dir = src_dir + Path.basic("data") def write_data(file: Path, content: JSON.T): Unit = write(Path.basic("data") + file, isabelle.JSON.Format(content)) val content_dir = src_dir + Path.basic("content") def write_content(file: Path, content: JSON.T): Unit = write(Path.basic("content") + file, isabelle.JSON.Format(content)) - def write_asset(file: Path, content: String): Unit = - write(Path.basic("assets") + file, content) - val static_dir = src_dir + Path.basic("static") + + def write_static(file: Path, content: JSON.T): Unit = + write(Path.basic("static") + file, isabelle.JSON.Format(content)) def copy_project(): Unit = { if (hugo_static.canonical != src_dir) { for { file <- List( List("content", "_index.md"), List("content", "about.md"), List("content", "contribution.md"), List("content", "download.md"), List("content", "help.md"), List("content", "search.md"), List("content", "statistics.md"), List("content", "submission.md"), List("themes"), List("theories"), List("config.json")) } Isabelle_System.copy_dir(hugo_static + Path.make(file), src_dir) } } } object Layout { def apply(src_dir: Path = Path.explode("$AFP_BASE") + Path.make(List("web", "hugo"))): Layout = new Layout(src_dir.canonical) } private lazy val exec = Path.explode(proper_string(hugo_home).getOrElse(error("No hugo component found"))) + Path.basic("hugo") def build(layout: Layout, out_dir: Path = 
Path.explode("$AFP_BASE") + Path.basic("web")): Process_Result = { Isabelle_System.bash( exec.implode + " -s " + quote(layout.src_dir.implode) + " -d " + quote(out_dir.canonical.implode)) } def watch(layout: Layout, out_dir: Path = Path.explode("$AFP_BASE") + Path.basic("web"), progress: Progress = new Progress()): Process_Result = { Isabelle_System.bash( exec.implode + " server -s " + quote(layout.src_dir.implode) + " -d " + quote(out_dir.canonical.implode), progress_stdout = progress.echo, progress_stderr = progress.echo_warning) } } \ No newline at end of file