diff --git a/admin/sitegen-lib/keywords.py b/admin/sitegen-lib/keywords.py
--- a/admin/sitegen-lib/keywords.py
+++ b/admin/sitegen-lib/keywords.py
@@ -1,57 +1,57 @@
 """Generates a list of keywords for the search autocomplete.

 Each entry’s abstract is sanitised and then the keywords are extracted
 with the RAKE algorithm.
 """
 import json
 import os
 import re
 from itertools import groupby

 import unidecode
 from rake_nltk import Rake
 import nltk

 nltk.download('stopwords')
 nltk.download('punkt')


 def generate_keywords(entries_dir):
     """RAKE is used to extract the keywords from every abstract.

     The top 8 keywords are added to a list of all keywords, and the
     keywords that appear in at least two abstracts are preserved.
     Finally, plurals are removed.
     """
     rake_object = Rake(max_length=2)
     replacements = [
         (r"\s+", " "),
         (r"<.*?>", ""),
         (r"[^\w\s/.()',-]", " "),
         (r"\s+", " "),
     ]

     keywords = []
     for entry in os.listdir(entries_dir):
         with open(os.path.join(entries_dir, entry)) as json_file:
             data = json.load(json_file)
             text = data["abstract"]
             for old, new in replacements:
                 text = re.sub(old, new, text)
             text = unidecode.unidecode(text)
             rake_object.extract_keywords_from_text(text)
-            keywords = rake_object.get_ranked_phrases()
+            keywords.extend(rake_object.get_ranked_phrases())

     # keep keywords that appear in 2 or more abstracts
     keywords = [i for i, c in groupby(sorted(keywords)) if len(list(c)) > 1]

     # remove plurals if we have the singular
     for keyword in keywords:
         if keyword + "s" in keywords:
             keywords.remove(keyword + "s")

     return [{"id": i, "keyword": x} for i, x in enumerate(keywords)]
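
The sort-then-groupby idiom above keeps exactly the phrases that occur in at least two abstracts, since groupby only merges adjacent equal items. A minimal sketch of the idiom on made-up phrases, using only the standard library:

    from itertools import groupby

    phrases = ["separation logic", "lambda calculus", "separation logic"]
    # sorting first makes equal phrases adjacent, so groupby can count them
    kept = [p for p, grp in groupby(sorted(phrases)) if len(list(grp)) > 1]
    assert kept == ["separation logic"]
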
""" keywords = {} - for obj in generate_keywords(entries_dir): + kws = generate_keywords(entries_dir) + with open(keywords_file, "w") as f: + json.dump(kws, f, ensure_ascii=False, separators=(",", ":")) + + for obj in kws: keywords[obj["keyword"]] = [] dependencies = {} topics = {} for entry in os.listdir(entries_dir): shortname = entry[:-3] with open(os.path.join(entries_dir, entry)) as file: data = json.load(file) if "dependencies" in data: for dep in data["dependencies"]: if dep in dependencies: dependencies[dep].append(shortname) else: dependencies[dep] = [shortname] if "topics" in data: for topic in data["topics"]: if topic in topics: topics[topic].append(shortname) else: topics[topic] = [shortname] for keyword in keywords.keys(): if keyword in data["abstract"].lower(): keywords[keyword].append(shortname) for keyword, values in list(keywords.items()): if len(values) > 10: keywords.pop(keyword) related_entries = {} for dataSet in [(keywords, 1), (dependencies, 1.5), (topics, 0.5)]: populate_related(dataSet[0], related_entries, dataSet[1]) for entry in related_entries: for keyword, value in list(related_entries[entry].items()): if value <= 2.5: related_entries[entry].pop(keyword) final_related = {} for keyword, values in related_entries.items(): final_related[keyword] = top_three(values) for entry, related in final_related.items(): if related: data = {"related": related} write_file(os.path.join(entries_dir, entry + ".md"), data) def populate_related(data, related, modifier=1): """This is a heavily nested loop to create the relatedEntries dictionary. For each of the categories, the list of entries associated with each key is iterated over twice and, if the entries are not the same, the modifier of that category is added to the relatedness score between the two entries in the dictionary. As the loop iterates twice over the value set, the resulting dictionary is bijective — i.e., the value for A->B will be equal to B->A. 
""" for _, entries in data.items(): for keyEntry in entries: for valueEntry in entries: if valueEntry != keyEntry: if keyEntry in related: if valueEntry in related[keyEntry]: related[keyEntry][valueEntry] += modifier else: related[keyEntry][valueEntry] = modifier else: related[keyEntry] = {valueEntry: modifier} def top_three(dictionary): """Returns the highest three dictionary keys by value""" return sorted(dictionary, key=dictionary.get, reverse=True)[:3] diff --git a/tools/afp_site_gen.scala b/tools/afp_site_gen.scala --- a/tools/afp_site_gen.scala +++ b/tools/afp_site_gen.scala @@ -1,222 +1,226 @@ package afp import isabelle._ object AFP_Site_Gen { /* json */ object JSON { type T = isabelle.JSON.T object Object { type T = isabelle.JSON.Object.T } def from_authors(authors: List[Metadata.Author]): T = Metadata.TOML.from_authors(authors) def from_topics(topics: List[Metadata.Topic]): T = Metadata.TOML.from_topics(topics) def from_affiliations(affiliations: List[Metadata.Affiliation]): Object.T = { Utils.group_sorted(affiliations, (a: Metadata.Affiliation) => a.author).view.mapValues(author_affiliations => { val homepage = author_affiliations.collectFirst { case Metadata.Homepage(_, _, url) => url } val email = author_affiliations.collectFirst { case Metadata.Email(_, _, address) => address } isabelle.JSON.Object( homepage.map(s => List("homepage" -> s)).getOrElse(Nil) ::: email.map(s => List("email" -> s)).getOrElse(Nil): _*) }).toMap } def from_change_history(history: Metadata.Change_History): Object.T = { if (history.isEmpty) { Map.empty } else { Map("Change history" -> history.map { case (date, str) => "[" + date.toString + "] " + str }.mkString("\n")) } } def from_entry(entry: Metadata.Entry): JSON.Object.T = isabelle.JSON.Object( "title" -> entry.title :: "authors" -> entry.authors.map(_.author).distinct :: "affiliations" -> from_affiliations(entry.authors ++ entry.contributors) :: (if (entry.contributors.nonEmpty) "contributors" -> entry.contributors.map(_.author).distinct :: Nil else Nil) ::: "date" -> entry.date.toString :: "topics" -> entry.topics.map(_.id) :: "abstract" -> entry.`abstract` :: "license" -> entry.license :: (if (entry.releases.nonEmpty) "releases" -> entry.releases.map(r => r.isabelle -> r.date.toString).toMap :: Nil else Nil) ::: (if (entry.note.nonEmpty) "note" -> entry.note :: Nil else Nil) ::: (if (entry.change_history.nonEmpty || entry.extra.nonEmpty) "extra" -> (from_change_history(entry.change_history) ++ entry.extra) :: Nil else Nil): _*) } /* site generation */ def afp_site_gen( out_dir: Option[Path], layout: Hugo.Layout, afp_structure: AFP_Structure, progress: Progress = new Progress()): Unit = { /* add authors */ progress.echo("Preparing authors...") val authors = afp_structure.load_authors val authors_by_id = Utils.grouped_sorted(authors, (a: Metadata.Author) => a.id) layout.write_data(Path.basic("authors.json"), JSON.from_authors(authors)) /* add topics */ progress.echo("Preparing topics...") val topics = afp_structure.load_topics def sub_topics(topic: Metadata.Topic): List[Metadata.Topic] = topic :: topic.sub_topics.flatMap(sub_topics) val topics_by_id = Utils.grouped_sorted(topics.flatMap(sub_topics), (t: Metadata.Topic) => t.id) layout.write_data(Path.basic("topics.json"), JSON.from_topics(topics)) /* add releases */ progress.echo("Preparing releases...") val releases_by_entry = afp_structure.load_releases.groupBy(_.entry) /* add entries and theory listings */ progress.echo("Preparing entries...") val sessions_structure = 
     val sessions_structure = afp_structure.sessions_structure
     val sessions_deps = Sessions.deps(sessions_structure)

     for (name <- afp_structure.entries) {
       val entry = afp_structure.load_entry(name, authors_by_id, topics_by_id, releases_by_entry)

       val deps =
         for {
           session <- afp_structure.entry_sessions(name)
           dep <- sessions_structure.imports_graph.imm_preds(session.name)
           if session.name != dep && sessions_structure(dep).groups.contains("afp")
         } yield dep

       val topo_theories =
         for {
           session <- afp_structure.entry_sessions(name)
           base = sessions_deps(session.name)
           node <- base.session_theories
         } yield node.theory_base_name

       val entry_json = JSON.from_entry(entry) ++ isabelle.JSON.Object(
         "dependencies" -> deps.distinct,
         "theories" -> topo_theories,
         "aliases" -> List("/entries/" + name + ".html"))

       val theories_json = isabelle.JSON.Object(
         "url" -> ("/entries/" + name.toLowerCase + "/theories"),
         "theories" -> topo_theories)

       layout.write_content(Path.make(List("entries", name + ".md")), entry_json)
       layout.write_content(Path.make(List("theories", name + ".md")), theories_json)
     }

-    /* add related entries */
+    /* add related entries and keywords */

     progress.echo("Preparing related entries...")

     val entries_dir = layout.content_dir + Path.basic("entries")
-    val related_cmd = "from related import *; add_related(" + quote(entries_dir.implode) + ")"
+    val keywords_file = layout.static_dir + Path.make(List("data", "keywords.json"))
+    keywords_file.dir.file.mkdirs()
+    val related_cmd = "from related import *; add_related(" +
+      commas_quote(List(entries_dir.implode, keywords_file.implode)) +
+      ")"

     Python.run(related_cmd).check

     /* add statistics */

     progress.echo("Preparing statistics...")

     val statistics_cmd = "from statistics import *; add_statistics(" +
       commas_quote(
         List(
           Path.explode("$AFP_BASE").absolute.implode,
           Path.explode("$AFP").absolute.implode,
           layout.data_dir.implode)) +
       ")"

     Python.run(statistics_cmd).check

     (layout.data_dir + Path.basic("statistics.html")).file.delete()

-    /* static */
+    /* project */

-    progress.echo("Preparing static files")
+    progress.echo("Preparing project files")

-    layout.write_static()
+    layout.copy_project()

     /* hugo */

     out_dir match {
       case Some(out_dir) =>
         progress.echo("Building site...")
         Hugo.build(layout, out_dir).check
         progress.echo("Finished building site")
       case None =>
         progress.echo("Finished sitegen preparation.")
     }
   }

   val isabelle_tool = Isabelle_Tool("afp_site_gen", "generates afp website source",
     Scala_Project.here, args =>
   {
     var base_dir = Path.explode("$AFP_BASE")
     var hugo_dir = base_dir + Path.make(List("web", "hugo"))
     var out_dir: Option[Path] = None

     val getopts = Getopts("""
 Usage: isabelle afp_site_gen [OPTIONS]

   Options are:
     -B DIR       afp base dir (default """" + base_dir.implode + """")
     -H DIR       generated hugo project dir (default """" + hugo_dir.implode + """")
     -O DIR       output dir (default none)

   Generates the AFP website source. HTML files of entries are dynamically loaded.
   Site will be built from generated source if output dir is specified.
 """,
       "B:" -> (arg => base_dir = Path.explode(arg)),
       "H:" -> (arg => hugo_dir = Path.explode(arg)),
       "O:" -> (arg => out_dir = Some(Path.explode(arg))))

     getopts(args)

     val afp_structure = AFP_Structure(base_dir)
     val layout = Hugo.Layout(hugo_dir)
     val progress = new Console_Progress()

     progress.echo("Preparing site generation in " + hugo_dir.implode)

     afp_site_gen(out_dir = out_dir, layout = layout, afp_structure = afp_structure,
       progress = progress)
   })
 }
\ No newline at end of file
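
For reference, with the hugo project dir laid out as above, the command string handed to Python.run by the related-entries step evaluates to a Python one-liner of this shape (the paths are illustrative):

    from related import *; add_related("<hugo-dir>/content/entries", "<hugo-dir>/static/data/keywords.json")
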
""", "B:" -> (arg => base_dir = Path.explode(arg)), "H:" -> (arg => hugo_dir = Path.explode(arg)), "O:" -> (arg => out_dir = Some(Path.explode(arg)))) getopts(args) val afp_structure = AFP_Structure(base_dir) val layout = Hugo.Layout(hugo_dir) val progress = new Console_Progress() progress.echo("Preparing site generation in " + hugo_dir.implode) afp_site_gen(out_dir = out_dir, layout = layout, afp_structure = afp_structure, progress = progress) }) } \ No newline at end of file diff --git a/tools/hugo.scala b/tools/hugo.scala --- a/tools/hugo.scala +++ b/tools/hugo.scala @@ -1,78 +1,80 @@ package afp import isabelle._ object Hugo { val hugo_home = Isabelle_System.getenv("ISABELLE_HUGO") val hugo_static = Path.explode("$AFP_BASE") + Path.make(List("admin", "site")) class Layout private(private[Hugo] val src_dir: Path) { private def write(file: Path, content: String): Unit = { val path = src_dir + file if (!path.dir.file.exists()) path.dir.file.mkdirs() File.write(path, content) } val data_dir = src_dir + Path.basic("data") def write_data(file: Path, content: JSON.T): Unit = write(Path.basic("data") + file, isabelle.JSON.Format(content)) val content_dir = src_dir + Path.basic("content") def write_content(file: Path, content: JSON.T): Unit = write(Path.basic("content") + file, isabelle.JSON.Format(content)) def write_asset(file: Path, content: String): Unit = write(Path.basic("assets") + file, content) - def write_static(): Unit = + val static_dir = src_dir + Path.basic("static") + + def copy_project(): Unit = { if (hugo_static.canonical != src_dir) { for { file <- List( List("content", "_index.md"), List("content", "about.md"), List("content", "contribution.md"), List("content", "download.md"), List("content", "help.md"), List("content", "search.md"), List("content", "statistics.md"), List("content", "submission.md"), List("themes"), List("theories"), List("config.json")) } Isabelle_System.copy_dir(hugo_static + Path.make(file), src_dir) } } } object Layout { def apply(src_dir: Path = Path.explode("$AFP_BASE") + Path.make(List("web", "hugo"))): Layout = new Layout(src_dir.canonical) } private lazy val exec = Path.explode(proper_string(hugo_home).getOrElse(error("No hugo component found"))) + Path.basic("hugo") def build(layout: Layout, out_dir: Path = Path.explode("$AFP_BASE") + Path.basic("web")): Process_Result = { Isabelle_System.bash( exec.implode + " -s " + quote(layout.src_dir.implode) + " -d " + quote(out_dir.canonical.implode)) } def watch(layout: Layout, out_dir: Path = Path.explode("$AFP_BASE") + Path.basic("web"), progress: Progress = new Progress()): Process_Result = { Isabelle_System.bash( exec.implode + " server -s " + quote(layout.src_dir.implode) + " -d " + quote(out_dir.canonical.implode), progress_stdout = progress.echo, progress_stderr = progress.echo_warning) } } \ No newline at end of file