diff --git a/admin/components/afp b/admin/components/afp
--- a/admin/components/afp
+++ b/admin/components/afp
@@ -1,4 +1,3 @@
 # components required for the AFP infrastructure
-python-3.10.4
 hugo-0.88.1
 ci-extras-1
diff --git a/admin/sitegen-lib/keywords.py b/admin/sitegen-lib/keywords.py
deleted file mode 100644
--- a/admin/sitegen-lib/keywords.py
+++ /dev/null
@@ -1,9 +0,0 @@
-"""Generates a list of keywords for the search autocomplete using RAKE."""
-import RAKE
-
-
-def print_keywords(text):
-    rake = RAKE.Rake(RAKE.SmartStopList())
-    res = rake.run(text, minCharacters=3, maxWords=2, minFrequency=1)
-    for keyword in res:
-        print(keyword)
diff --git a/admin/sitegen-req.txt b/admin/sitegen-req.txt
deleted file mode 100644
--- a/admin/sitegen-req.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-Jinja2==3.1.2
-termcolor==1.1.0
-pytz==2018.5
-six==1.11.0
-tqdm==4.62.3
-unidecode==1.3.2
-bs4==0.0.1
-requests==2.26.0
-lxml==4.6.3
-python-rake==1.5.0
diff --git a/etc/build.props b/etc/build.props
--- a/etc/build.props
+++ b/etc/build.props
@@ -1,25 +1,26 @@
 title = AFP/Tools
 module = $AFP_BASE/tools/lib/classes/afp_tools.jar
 requirements = \
   env:ISABELLE_SCALA_JAR \
   $ISABELLE_HOME_USER/contrib/ci-extras-1/lib/ci-extras.jar
 sources = \
   tools/admin/afp_build_hugo.scala \
   tools/admin/afp_build_python.scala \
   tools/migration/afp_migrate_metadata.scala \
   tools/afp_build.scala \
   tools/afp_check_roots.scala \
   tools/afp_dependencies.scala \
   tools/afp_site_gen.scala \
   tools/afp_structure.scala \
   tools/afp_tool.scala \
   tools/hugo.scala \
   tools/metadata.scala \
-  tools/python.scala \
+  tools/rake.scala \
   tools/toml.scala \
   tools/utils.scala
 resources = \
+  tools/SmartStoplist.txt:SmartStoplist.txt \
   tools/migration/public_suffix_list.dat:public_suffix_list.dat
 services = \
   afp.Tools \
   afp.Build_Tools
diff --git a/tools/SmartStoplist.txt b/tools/SmartStoplist.txt
new file mode 100644
--- /dev/null
+++ b/tools/SmartStoplist.txt
@@ -0,0 +1,572 @@
+#stop word list from SMART (Salton,1971). Available at ftp://ftp.cs.cornell.edu/pub/smart/english.stop
+a
+a's
+able
+about
+above
+according
+accordingly
+across
+actually
+after
+afterwards
+again
+against
+ain't
+all
+allow
+allows
+almost
+alone
+along
+already
+also
+although
+always
+am
+among
+amongst
+an
+and
+another
+any
+anybody
+anyhow
+anyone
+anything
+anyway
+anyways
+anywhere
+apart
+appear
+appreciate
+appropriate
+are
+aren't
+around
+as
+aside
+ask
+asking
+associated
+at
+available
+away
+awfully
+b
+be
+became
+because
+become
+becomes
+becoming
+been
+before
+beforehand
+behind
+being
+believe
+below
+beside
+besides
+best
+better
+between
+beyond
+both
+brief
+but
+by
+c
+c'mon
+c's
+came
+can
+can't
+cannot
+cant
+cause
+causes
+certain
+certainly
+changes
+clearly
+co
+com
+come
+comes
+concerning
+consequently
+consider
+considering
+contain
+containing
+contains
+corresponding
+could
+couldn't
+course
+currently
+d
+definitely
+described
+despite
+did
+didn't
+different
+do
+does
+doesn't
+doing
+don't
+done
+down
+downwards
+during
+e
+each
+edu
+eg
+eight
+either
+else
+elsewhere
+enough
+entirely
+especially
+et
+etc
+even
+ever
+every
+everybody
+everyone
+everything
+everywhere
+ex
+exactly
+example
+except
+f
+far
+few
+fifth
+first
+five
+followed
+following
+follows
+for
+former
+formerly
+forth
+four
+from
+further
+furthermore
+g
+get
+gets
+getting
+given
+gives
+go
+goes
+going
+gone
+got
+gotten
+greetings
+h
+had
+hadn't
+happens
+hardly
+has
+hasn't
+have
+haven't
+having
+he
+he's
+hello
+help
+hence
+her
+here
+here's
+hereafter
+hereby
+herein
+hereupon
+hers
+herself
+hi
+him
+himself
+his
+hither
+hopefully
+how
+howbeit
+however
+i
+i'd
+i'll
+i'm
+i've
+ie
+if
+ignored
+immediate
+in
+inasmuch
+inc
+indeed
+indicate
+indicated
+indicates
+inner
+insofar
+instead
+into
+inward
+is
+isn't
+it
+it'd
+it'll
+it's
+its
+itself
+j
+just
+k
+keep
+keeps
+kept
+know
+knows
+known
+l
+last
+lately
+later
+latter
+latterly
+least
+less
+lest
+let
+let's
+like
+liked
+likely
+little
+look
+looking
+looks
+ltd
+m
+mainly
+many
+may
+maybe
+me
+mean
+meanwhile
+merely
+might
+more
+moreover
+most
+mostly
+much
+must
+my
+myself
+n
+name
+namely
+nd
+near
+nearly
+necessary
+need
+needs
+neither
+never
+nevertheless
+new
+next
+nine
+no
+nobody
+non
+none
+noone
+nor
+normally
+not
+nothing
+novel
+now
+nowhere
+o
+obviously
+of
+off
+often
+oh
+ok
+okay
+old
+on
+once
+one
+ones
+only
+onto
+or
+other
+others
+otherwise
+ought
+our
+ours
+ourselves
+out
+outside
+over
+overall
+own
+p
+particular
+particularly
+per
+perhaps
+placed
+please
+plus
+possible
+presumably
+probably
+provides
+q
+que
+quite
+qv
+r
+rather
+rd
+re
+really
+reasonably
+regarding
+regardless
+regards
+relatively
+respectively
+right
+s
+said
+same
+saw
+say
+saying
+says
+second
+secondly
+see
+seeing
+seem
+seemed
+seeming
+seems
+seen
+self
+selves
+sensible
+sent
+serious
+seriously
+seven
+several
+shall
+she
+should
+shouldn't
+since
+six
+so
+some
+somebody
+somehow
+someone
+something
+sometime
+sometimes
+somewhat
+somewhere
+soon
+sorry
+specified
+specify
+specifying
+still
+sub
+such
+sup
+sure
+t
+t's
+take
+taken
+tell
+tends
+th
+than
+thank
+thanks
+thanx
+that
+that's
+thats
+the
+their
+theirs
+them
+themselves
+then
+thence
+there
+there's
+thereafter
+thereby
+therefore
+therein
+theres
+thereupon
+these
+they
+they'd
+they'll
+they're
+they've
+think
+third
+this
+thorough
+thoroughly
+those
+though
+three
+through
+throughout
+thru
+thus
+to
+together
+too
+took
+toward
+towards
+tried
+tries
+truly
+try
+trying
+twice
+two
+u
+un
+under
+unfortunately
+unless
+unlikely
+until
+unto
+up
+upon
+us
+use
+used
+useful
+uses
+using
+usually
+uucp
+v
+value
+various
+very
+via
+viz
+vs
+w
+want
+wants
+was
+wasn't
+way
+we
+we'd
+we'll
+we're
+we've
+welcome
+well
+went
+were
+weren't
+what
+what's
+whatever
+when
+whence
+whenever
+where
+where's
+whereafter
+whereas
+whereby
+wherein
+whereupon
+wherever
+whether
+which
+while
+whither
+who
+who's
+whoever
+whole
+whom
+whose
+why
+will
+willing
+wish
+with
+within
+without
+won't
+wonder
+would
+would
+wouldn't
+x
+y
+yes
+yet
+you
+you'd
+you'll
+you're
+you've
+your
+yours
+yourself
+yourselves
+z
+zero
diff --git a/tools/afp_site_gen.scala b/tools/afp_site_gen.scala
--- a/tools/afp_site_gen.scala
+++ b/tools/afp_site_gen.scala
@@ -1,408 +1,385 @@
 /* Author: Fabian Huch, TU Muenchen
 
 Generation and compilation of SSG project for the AFP website.
  */
 package afp
 
 
 import isabelle._
 
 import afp.Metadata.{Affiliation, Author, Email, Entry, Homepage}
 
 
 object AFP_Site_Gen {
   /* json */
 
   object JSON {
     type T = isabelle.JSON.T
 
     object Object {
       type T = isabelle.JSON.Object.T
     }
 
     def from_authors(authors: List[Metadata.Author]): T = {
       authors.map(author => author.id -> isabelle.JSON.Object(
         "name" -> author.name,
         "emails" -> author.emails.map(_.address),
         "homepages" -> author.homepages.map(_.url))).toMap
     }
 
     def from_topics(topics: List[Metadata.Topic]): T =
       topics.map(topic => isabelle.JSON.Object(topic.name -> from_topics(topic.sub_topics)))
 
     def from_affiliations(affiliations: List[Metadata.Affiliation]): Object.T = {
       Utils.group_sorted(affiliations, (a: Metadata.Affiliation) => a.author).view.mapValues(
         { author_affiliations =>
           val homepage = author_affiliations.collectFirst { case Metadata.Homepage(_, _, url) => url }
           val email = author_affiliations.collectFirst { case Metadata.Email(_, _, address) => address }
 
           isabelle.JSON.Object(
             homepage.map(s => List("homepage" -> s)).getOrElse(Nil) :::
               email.map(s => List("email" -> s)).getOrElse(Nil): _*)
         }).toMap
     }
 
     def from_change_history(history: Metadata.Change_History): Object.T = {
       if (history.isEmpty) {
         Map.empty
       } else {
         Map("Change history" -> history.map {
           case (date, str) => "[" + date.toString + "] " + str
         }.mkString("\n"))
       }
     }
 
     def from_entry(entry: Metadata.Entry): JSON.Object.T =
       isabelle.JSON.Object(
         "title" -> entry.title ::
           "authors" -> entry.authors.map(_.author).distinct ::
           "affiliations" -> from_affiliations(entry.authors ++ entry.contributors) ::
           (if (entry.contributors.nonEmpty)
             "contributors" -> entry.contributors.map(_.author).distinct :: Nil
            else Nil) :::
           "date" -> entry.date.toString ::
           "topics" -> entry.topics.map(_.id) ::
           "abstract" -> entry.`abstract` ::
           "license" -> entry.license.name ::
           (if (entry.releases.nonEmpty)
             "releases" -> entry.releases.map(r => r.isabelle -> r.date.toString).toMap :: Nil
            else Nil) :::
           (if (entry.note.nonEmpty) "note" -> entry.note :: Nil else Nil) :::
           (if (entry.change_history.nonEmpty || entry.extra.nonEmpty)
             "extra" -> (from_change_history(entry.change_history) ++ entry.extra) :: Nil
            else Nil): _*)
 
     def from_keywords(keywords: List[String]): T =
       keywords.zipWithIndex.map {
         case (keyword, i) => isabelle.JSON.Object("id" -> i, "keyword" -> keyword)
       }
   }
 
 
-  /* keyword extraction */
-
-  private val replacements = List(
-    "<[^>]*>".r -> "",
-    "[^\\w\\s()'.,;:-]".r -> " ",
-    "\\s+".r -> " ")
-
-  def extract_keywords(text: String): List[String] = {
-    val stripped_text =
-      replacements.foldLeft(text) { case (res, (regex, replacement)) => regex.replaceAllIn(res, replacement) }
-
-    val arg = quote(stripped_text.replaceAll("\"", "\\\""))
-
-    val keyword_cmd = "from keywords import *; print_keywords(" + arg + ")"
-
-    Python.run(keyword_cmd).check.out_lines
-  }
-
-
   /* stats */
 
   def afp_stats(deps: Sessions.Deps, structure: AFP_Structure, entries: List[Entry],
     progress: Progress): JSON.T = {
     def round(int: Int): Int = Math.round(int.toFloat / 100) * 100
 
     def nodes(entry: Entry): List[Document.Node.Name] =
       structure.entry_sessions(entry.name).flatMap(session => deps(session.name).session_theories)
 
     var entry_lines = Map.empty[Entry, Int]
     var entry_lemmas = Map.empty[Entry, Int]
     for {
       entry <- entries
       node <- nodes(entry)
       lines = split_lines(File.read(node.path)).map(_.trim)
     } {
       entry_lines += entry -> (entry_lines.getOrElse(entry, 0) + lines.count(_.nonEmpty))
       entry_lemmas += entry -> (entry_lemmas.getOrElse(entry, 0) +
         lines.count(line => List("lemma", "theorem", "corollary").exists(line.startsWith)))
     }
 
     val first_year = entries.flatMap(_.releases).map(_.date.getYear).min
     def years(upto: Int): List[Int] = Range.inclusive(first_year, upto).toList
 
     val current_year = Date.now().rep.getYear
     val all_years = years(current_year)
 
     // per Isabelle release year
     val by_year = entries.groupBy(_.date.getYear)
     val size_by_year = by_year.view.mapValues(_.length).toMap
     val loc_by_year = by_year.view.mapValues(_.map(entry_lines).sum).toMap
     val authors_by_year = by_year.view.mapValues(_.flatMap(_.authors).map(_.author)).toMap
 
     val num_lemmas = entries.map(entry_lemmas).sum
     val num_lines = entries.map(entry_lines).sum
 
     // accumulated
     def total_articles(year: Int): Int = years(year).map(size_by_year.getOrElse(_, 0)).sum
     def total_loc(year: Int): Int = round(years(year).map(loc_by_year.getOrElse(_, 0)).sum)
     def total_authors(year: Int): Int =
       years(year).flatMap(authors_by_year.getOrElse(_, Nil)).distinct.length
     def fresh_authors(year: Int): Int = total_authors(year) - total_authors(year - 1)
 
     val sorted = entries.sortBy(_.date)
 
     isabelle.JSON.Object(
       "num_lemmas" -> num_lemmas,
       "num_loc" -> num_lines,
       "articles_year" -> all_years.map(total_articles),
       "loc_years" -> all_years.map(total_loc),
       "author_years" -> all_years.map(fresh_authors),
       "author_years_cumulative" -> all_years.map(total_authors),
       "loc_articles" -> sorted.map(entry_lines),
       "all_articles" -> sorted.map(_.name))
   }
 
 
   /* site generation */
 
   def afp_site_gen(
     layout: Hugo.Layout,
     status_file: Option[Path],
     afp_structure: AFP_Structure,
     clean: Boolean = false,
     progress: Progress = new Progress()
   ): Unit = {
     /* clean old */
 
     if (clean) {
       progress.echo("Cleaning up generated files...")
       layout.clean()
     }
 
 
     /* add topics */
 
     progress.echo("Preparing topics...")
 
     val topics = afp_structure.load_topics
 
     def sub_topics(topic: Metadata.Topic): List[Metadata.Topic] =
       topic :: topic.sub_topics.flatMap(sub_topics)
 
     val topics_by_id = Utils.grouped_sorted(topics.flatMap(sub_topics), (t: Metadata.Topic) => t.id)
 
     layout.write_data(Path.basic("topics.json"), JSON.from_topics(topics))
 
 
     /* add licenses */
 
     progress.echo("Preparing licenses...")
 
     val licenses_by_id =
       Utils.grouped_sorted(afp_structure.load_licenses, (l: Metadata.License) => l.id)
 
 
     /* add releases */
 
     progress.echo("Preparing releases...")
 
     val releases_by_entry = afp_structure.load_releases.groupBy(_.entry)
 
 
     /* prepare authors and entries */
 
     progress.echo("Preparing authors...")
 
     val full_authors = afp_structure.load_authors
     val authors_by_id = Utils.grouped_sorted(full_authors, (a: Metadata.Author) => a.id)
 
     var seen_affiliations = Set.empty[Affiliation]
 
     val entries =
       afp_structure.entries.flatMap { name =>
         val entry = afp_structure.load_entry(name, authors_by_id, topics_by_id, licenses_by_id,
           releases_by_entry)
 
         if (entry.sitegen_ignore) None
         else {
           seen_affiliations ++= entry.authors ++ entry.contributors
           Some(entry)
         }
       }
 
     val authors = Utils.group_sorted(seen_affiliations.toList, (a: Affiliation) => a.author).map {
       case (id, affiliations) =>
         val emails = affiliations.collect { case e: Email => e }
         val homepages = affiliations.collect { case h: Homepage => h }
         Author(id, authors_by_id(id).name, emails, homepages)
     }
 
     layout.write_data(Path.basic("authors.json"), JSON.from_authors(authors.toList))
 
 
     /* extract keywords */
 
     progress.echo("Extracting keywords...")
 
     var seen_keywords = Set.empty[String]
     val entry_keywords = entries.map { entry =>
-      val Keyword = """\('([^']*)', ([^)]*)\)""".r
-      val scored_keywords = extract_keywords(entry.`abstract`).map {
-        case Keyword(keyword, score) => keyword -> score.toDouble
-        case s => error("Could not parse: " + s)
-      }
+      val scored_keywords = Rake.extract_keywords(entry.`abstract`)
       seen_keywords ++= scored_keywords.map(_._1)
 
       entry.name -> scored_keywords.filter(_._2 > 1.0).map(_._1)
     }.toMap
 
     seen_keywords = seen_keywords.filter(k =>
       !k.endsWith("s") || !seen_keywords.contains(k.stripSuffix("s")))
 
     layout.write_static(Path.make(List("data", "keywords.json")),
       JSON.from_keywords(seen_keywords.toList))
 
     def get_keywords(name: Metadata.Entry.Name): List[String] =
       entry_keywords(name).filter(seen_keywords.contains).take(8)
 
 
     /* add entries and theory listings */
 
     progress.echo("Preparing entries...")
 
     val sessions_structure = afp_structure.sessions_structure
     val sessions_deps = Sessions.deps(sessions_structure)
 
     entries.foreach { entry =>
       val deps =
         for {
           session <- afp_structure.entry_sessions(entry.name)
           dep <- sessions_structure.imports_graph.imm_preds(session.name)
           if session.name != dep && sessions_structure(dep).groups.contains("AFP")
         } yield dep
 
       val theories = afp_structure.entry_sessions(entry.name).map { session =>
         val base = sessions_deps(session.name)
         val theories = base.session_theories.map(_.theory_base_name)
         val session_json = isabelle.JSON.Object(
           "title" -> session.name,
           "entry" -> entry.name,
           "url" -> ("/theories/" + session.name.toLowerCase),
           "theories" -> theories)
         layout.write_content(Path.make(List("theories", session.name + ".md")), session_json)
         isabelle.JSON.Object("session" -> session.name, "theories" -> theories)
       }
 
       val entry_json = JSON.from_entry(entry) ++ isabelle.JSON.Object(
         "dependencies" -> deps.distinct,
         "sessions" -> theories,
         "url" -> ("/entries/" + entry.name + ".html"),
         "keywords" -> get_keywords(entry.name))
 
       layout.write_content(Path.make(List("entries", entry.name + ".md")), entry_json)
     }
 
 
     /* add authors */
 
 
     /* add statistics */
 
     progress.echo("Preparing statistics...")
 
     val statistics_json = afp_stats(sessions_deps, afp_structure, entries, progress)
 
     layout.write_data(Path.basic("statistics.json"), statistics_json)
 
 
     /* project */
 
     progress.echo("Preparing project files")
 
     layout.copy_project()
 
 
     /* status */
 
     status_file match {
       case Some(status_file) =>
         progress.echo("Preparing devel version...")
         val status_json = isabelle.JSON.parse(File.read(status_file))
         layout.write_data(Path.basic("status.json"), status_json)
       case None =>
     }
 
     progress.echo("Finished sitegen preparation.")
   }
 
 
   /* build site */
 
   def afp_build_site(
     out_dir: Path,
     layout: Hugo.Layout,
     do_watch: Boolean = false,
     clean: Boolean = false,
     progress: Progress = new Progress()
   ): Unit = {
     if (clean) {
       progress.echo("Cleaning output dir...")
       Hugo.clean(out_dir)
     }
 
     if (do_watch) {
       Hugo.watch(layout, out_dir, progress).check
     } else {
       progress.echo("Building site...")
       Hugo.build(layout, out_dir).check
       progress.echo("Build in " + (out_dir + Path.basic("index.html")).absolute.implode)
     }
   }
 
 
   /* tool wrapper */
 
   val isabelle_tool = Isabelle_Tool("afp_site_gen", "generates afp website source",
     Scala_Project.here, { args =>
       var base_dir = Path.explode("$AFP_BASE")
       var status_file: Option[Path] = None
       var hugo_dir = base_dir + Path.make(List("web", "hugo"))
       var out_dir: Path = base_dir + Path.make(List("web", "out"))
       var build_only = false
       var devel_mode = false
       var fresh = false
 
       val getopts = Getopts("""
 Usage: isabelle afp_site_gen [OPTIONS]
 
   Options are:
     -B DIR       afp base dir (default """" + base_dir.implode + """")
     -D FILE      build status file for devel version
     -H DIR       generated hugo project dir (default """" + hugo_dir.implode + """")
     -O DIR       output dir for build (default """ + out_dir.implode + """)
     -b           build only
     -d           devel mode (overrides hugo dir, builds site in watch mode)
     -f           fresh build: clean up existing hugo and build directories
 
   Generates the AFP website source. HTML files of entries are dynamically loaded.
   Providing a status file will build the development version of the archive.
   Site will be built from generated source if output dir is specified.
 """,
         "B:" -> (arg => base_dir = Path.explode(arg)),
         "D:" -> (arg => status_file = Some(Path.explode(arg))),
         "H:" -> (arg => hugo_dir = Path.explode(arg)),
         "O:" -> (arg => out_dir = Path.explode(arg)),
         "b" -> (_ => build_only = true),
         "d" -> (_ => devel_mode = true),
         "f" -> (_ => fresh = true))
 
       getopts(args)
 
       status_file.foreach(path =>
         if (!path.is_file || !path.file.exists()) error("Invalid status file: " + path))
 
       if (devel_mode) hugo_dir = base_dir + Path.make(List("admin", "site"))
 
       val afp_structure = AFP_Structure(base_dir)
       val layout = Hugo.Layout(hugo_dir)
       val progress = new Console_Progress()
 
       if (!build_only) {
         progress.echo("Preparing site generation in " + hugo_dir.implode)
 
         afp_site_gen(layout = layout, status_file = status_file, afp_structure = afp_structure,
           clean = fresh, progress = progress)
       }
 
       afp_build_site(out_dir = out_dir, layout = layout, do_watch = devel_mode, clean = fresh,
         progress = progress)
     })
 }
\ No newline at end of file
diff --git a/tools/python.scala b/tools/python.scala
deleted file mode 100644
--- a/tools/python.scala
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Author: Fabian Huch, TU Muenchen
-
-Wrapper for Python interaction, to be used with the AFP Python component.
- */
-package afp
-
-
-import isabelle._
-
-
-object Python {
-  val python_home = Isabelle_System.getenv("ISABELLE_PYTHON")
-
-  val python_path = (Path.explode("$AFP_BASE") + Path.make(List("admin", "sitegen-lib"))).absolute
-
-  def run(script: String): Process_Result = {
-    val exec =
-      Path.explode(proper_string(python_home).getOrElse("No python component found")) +
-        Path.make(List("bin", "python3"))
-
-    val script_file = Isabelle_System.tmp_file("isabelle_python", "py")
-    File.write(script_file, script)
-
-    val res = Isabelle_System.bash(exec.implode + " " + script_file.getAbsolutePath,
-      env = Isabelle_System.settings(List("PYTHONPATH" -> python_path.implode)))
-
-    script_file.delete()
-    res
-  }
-}
diff --git a/tools/rake.scala b/tools/rake.scala
new file mode 100644
--- /dev/null
+++ b/tools/rake.scala
@@ -0,0 +1,80 @@
+/* Author: Fabian Huch, TU Muenchen
+
+Rapid keyword extraction algorithm. From:
+Rose, S., Engel, D., Cramer, N., & Cowley, W. (2010). Automatic keyword extraction from individual
+documents. In: Text Mining: Applications and Theory, pp. 1-20. DOI: 10.1002/9780470689646.ch1
+ */
+package afp
+
+
+import isabelle._
+
+import scala.util.matching.Regex
+
+import java.io.{BufferedReader, InputStreamReader}
+
+
+object Rake {
+  private val max_words = 2
+  private val min_chars = 3
+
+  private val number = "^[0-9]+$".r
+  private val replacements = List(
+    "<[^>]*>".r -> "",
+    "[^\\w\\s()'.,;:-]".r -> " ",
+    "\\s+".r -> " ")
+
+  private lazy val stop_words: Regex = {
+    val stream = getClass.getClassLoader.getResourceAsStream("SmartStoplist.txt")
+    val reader = new BufferedReader(new InputStreamReader(stream))
+    val stop_words = File.read_lines(reader, _ => ()).filterNot(_.startsWith("#")).map(_.strip)
+    val regex = "\\b(?i)(" + stop_words.map(l => l + "(?![\\w-])").mkString("|") + ")"
+    regex.r
+  }
+
+  val sentence_delimiters = "[.!?,;:\t\\\\\"()'\u2019\u2013]|\\s-\\s".r
+  val word_delimiters = "[^a-zA-Z0-9_+\\-/]".r
+
+  def separate_words(text: String): List[String] = {
+    for {
+      word_raw <- word_delimiters.split(text).toList
+      word = word_raw.strip.toLowerCase
+      if word.nonEmpty && word.length >= min_chars && !number.matches(word)
+    } yield word
+  }
+
+  def calculate_word_scores(phrases: List[String]): Map[String, Double] = {
+    def count(words: List[String], freq: Int, deg: Int): Option[(Int, Int)] =
+      Some((freq + 1, deg + words.length - 1))
+
+    var counts = Map.empty[String, (Int, Int)]
+    for {
+      phrase <- phrases
+      words = separate_words(phrase)
+      word <- words
+    } counts = counts.updatedWith(word) {
+      case Some((freq, deg)) => count(words, freq, deg)
+      case None => count(words, 0, 0)
+    }
+    counts.view.mapValues { case (freq, deg) => (deg + freq).toDouble / freq }.toMap
+  }
+
+  def extract_keywords(text: String): List[(String, Double)] = {
+    val stripped_text = replacements.foldLeft(text) {
+      case (res, (regex, replacement)) => regex.replaceAllIn(res, replacement)
+    }
+
+    val phrases = for {
+      sentence <- sentence_delimiters.split(stripped_text)
+      phrase <- stop_words.split(sentence)
+      if !phrase.isBlank
+    } yield phrase.strip().toLowerCase
+
+    val word_scores = calculate_word_scores(phrases.toList)
+
+    val keywords = phrases.filter(_.split(' ').length <= max_words).map(phrase =>
+      phrase -> separate_words(phrase).map(word_scores.getOrElse(_, 0.0)).sum)
+
+    keywords.sortBy(_._2).toList.reverse
+  }
+}
\ No newline at end of file
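
Note (not part of the patch): afp.Rake replaces the deleted Python pipeline (admin/sitegen-lib/keywords.py invoked via tools/python.scala). extract_keywords returns phrases with their RAKE scores, highest first, so afp_site_gen.scala no longer has to re-parse printed "('keyword', score)" tuples. Below is a minimal usage sketch in Scala; the demo object and its input text are invented for illustration, the score > 1.0 cut-off mirrors afp_site_gen.scala above, and running it assumes SmartStoplist.txt is on the classpath as declared in etc/build.props.

    import afp.Rake

    object Rake_Demo {
      def main(args: Array[String]): Unit = {
        // RAKE scores each word by degree/frequency over the candidate phrases
        // and scores a phrase as the sum of its word scores.
        val text = "We verify the rapid automatic keyword extraction algorithm in Isabelle/HOL."
        for ((keyword, score) <- Rake.extract_keywords(text) if score > 1.0)
          println(keyword + " -> " + score)
      }
    }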