From d32caaf8d258eaa1f36c8d8b3b63c8fe2012b2e4 Mon Sep 17 00:00:00 2001
From: Dominique Hazael-Massieux
Date: Thu, 1 Sep 2022 17:20:11 +0200
Subject: [PATCH 1/5] Restructure links extracts to keep original links

Currently, reffy "canonicalizes" URLs before recording them in the links
extracts. While that canonicalization is useful to group links that point
to the same spec, it loses critical information (e.g. for links to
multipage specs) and produces information that does not reflect what
actually appears in the spec, making it hard to run reliable analyses.

This patch instead records the original link untouched and annotates it
with the spec that can be identified after canonicalization.

This means the structure of the links extracts will change.
---
 src/browserlib/extract-links.mjs | 34 +++++++++++++++++++++-----------
 src/lib/specs-crawler.js         |  8 ++++----
 tests/crawl-test.json            | 20 +++++++++++--------
 3 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/src/browserlib/extract-links.mjs b/src/browserlib/extract-links.mjs
index 1d50753a..221dd8d4 100644
--- a/src/browserlib/extract-links.mjs
+++ b/src/browserlib/extract-links.mjs
@@ -2,27 +2,39 @@ import { canonicalizeUrl } from './canonicalize-url.mjs';
 
 /**
  * Extract and canonicalize absolute links of the document and their fragments
- * FIXME: ⚠ Modify the DOM
  */
-export default function () {
-  // Ignore links from the "head" section, which either link to
-  // self, the GitHub repo, the implementation report, and other
-  // documents that don't need to appear in the list of references.
+export default function (spec, _, specs) {
   const links = {};
-  [...document.querySelectorAll('.head a[href]')].forEach(n => n.href = '');
   document.querySelectorAll('a[href^=http]').forEach(n => {
-    const url = canonicalizeUrl(n.href);
-    if (!links[url]) {
-      links[url] = new Set();
+    // Ignore links from the "head" section, which either link to
+    // self, the GitHub repo, the implementation report, and other
+    // documents that don't need to appear in the list of references.
+    if (n.closest('.head')) return;
+    const pageUrl = n.href.split('#')[0];
+    if (!links[pageUrl]) {
+      links[pageUrl] = {anchors: new Set()};
     }
     if (n.href.includes('#') && n.href.split('#')[1]) {
-      links[url].add(n.href.split('#')[1]);
+      links[pageUrl].anchors.add(n.href.split('#')[1]);
+    }
+
+    // Annotate with the spec to which the page belongs if we can find one
+    if (Array.isArray(specs)) {
+      const specUrl = canonicalizeUrl(pageUrl);
+      let matchingSpec = specs.find(s => s?.release?.url === specUrl || s?.nightly?.url === specUrl || s?.series?.nightlyUrl === specUrl || s?.series?.releaseUrl === specUrl || s?.nightly?.pages?.includes(specUrl) || s?.release?.pages?.includes(specUrl));
+      if (matchingSpec) {
+        links[pageUrl].specShortname = matchingSpec.shortname;
+      }
     }
   });
 
   return Object.keys(links)
+    .sort()
     // turning sets into arrays
     .reduce((acc, u) => {
-      acc[u] = [...links[u]];
+      acc[u] = {specShortname: links[u].specShortname};
+      if (links[u].anchors.size > 0) {
+        acc[u].anchors = [...links[u].anchors];
+      }
       return acc;
     }, {});
 }
diff --git a/src/lib/specs-crawler.js b/src/lib/specs-crawler.js
index 09b78a20..f3864124 100644
--- a/src/lib/specs-crawler.js
+++ b/src/lib/specs-crawler.js
@@ -91,18 +91,18 @@ async function crawlSpec(spec, crawlOptions) {
     else {
         result = await processSpecification(
             urlToCrawl,
-            (spec, modules) => {
+            (spec, modules, speclist) => {
                 const idToHeading = modules.find(m => m.needsIdToHeadingMap) ?
                    window.reffy.mapIdsToHeadings() : null;
                const res = {
                    crawled: window.location.toString()
                };
                modules.forEach(mod => {
-                    res[mod.property] = window.reffy[mod.name](spec, idToHeading);
+                    res[mod.property] = window.reffy[mod.name](spec, idToHeading, speclist);
                });
                return res;
            },
-            [spec, crawlOptions.modules],
+            [spec, crawlOptions.modules, crawlOptions.speclist],
             { quiet: crawlOptions.quiet,
               forceLocalFetch: crawlOptions.forceLocalFetch,
               ...cacheInfo}
@@ -294,7 +294,7 @@ async function saveSpecResults(spec, settings) {
 async function crawlList(speclist, crawlOptions) {
     // Make a shallow copy of crawl options object since we're going
     // to modify properties in place
-    crawlOptions = Object.assign({}, crawlOptions);
+    crawlOptions = Object.assign({speclist}, crawlOptions);
 
     // Expand list of processing modules to use if not already done
     crawlOptions.modules = expandBrowserModules(crawlOptions.modules);
diff --git a/tests/crawl-test.json b/tests/crawl-test.json
index 0846b21a..378acc0c 100644
--- a/tests/crawl-test.json
+++ b/tests/crawl-test.json
@@ -13,9 +13,11 @@
     ],
     "crawled": "https://w3c.github.io/woff/woff2/",
     "links": {
-      "https://www.w3.org/TR/bar/": [
-        "baz"
-      ]
+      "https://www.w3.org/TR/bar/": {
+        "anchors": [
+          "baz"
+        ]
+      }
     },
     "title": "WOFF2",
     "css": {
@@ -64,10 +66,12 @@
     ],
     "crawled": "https://w3c.github.io/mediacapture-output/",
     "links": {
-      "https://webidl.spec.whatwg.org/": [
-        "Exposed",
-        "idl-DOMString"
-      ]
+      "https://webidl.spec.whatwg.org/": {
+        "anchors": [
+          "Exposed",
+          "idl-DOMString"
+        ]
+      }
     },
     "refs": {
       "normative": [],
@@ -188,7 +192,7 @@
     ],
     "crawled": "https://w3c.github.io/accelerometer/",
     "links": {
-      "https://www.w3.org/TR/Foo/": []
+      "https://www.w3.org/TR/Foo": {}
     },
     "refs": {
       "normative": [

From 84dda39da3ad7f36b1082616852220cbc7e9cdbd Mon Sep 17 00:00:00 2001
From: Dominique Hazael-Massieux
Date: Fri, 2 Sep 2022 10:28:57 +0200
Subject: [PATCH 2/5] Associate link with proper version when matching
 version-less series

---
 src/browserlib/extract-links.mjs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/browserlib/extract-links.mjs b/src/browserlib/extract-links.mjs
index 221dd8d4..fe614e79 100644
--- a/src/browserlib/extract-links.mjs
+++ b/src/browserlib/extract-links.mjs
@@ -21,7 +21,7 @@ export default function (spec, _, specs) {
     // Annotate with the spec to which the page belongs if we can find one
     if (Array.isArray(specs)) {
       const specUrl = canonicalizeUrl(pageUrl);
-      let matchingSpec = specs.find(s => s?.release?.url === specUrl || s?.nightly?.url === specUrl || s?.series?.nightlyUrl === specUrl || s?.series?.releaseUrl === specUrl || s?.nightly?.pages?.includes(specUrl) || s?.release?.pages?.includes(specUrl));
+      let matchingSpec = specs.find(s => s?.release?.url === specUrl || s?.nightly?.url === specUrl || (s?.series?.currentSpecification === s?.shortname && (s?.series?.nightlyUrl === specUrl || s?.series?.releaseUrl === specUrl)) || s?.nightly?.pages?.includes(specUrl) || s?.release?.pages?.includes(specUrl));
       if (matchingSpec) {
         links[pageUrl].specShortname = matchingSpec.shortname;
       }

From 90e11837110123fc96207f77b696e765810c8575 Mon Sep 17 00:00:00 2001
From: Dominique Hazael-Massieux
Date: Fri, 2 Sep 2022 17:04:20 +0200
Subject: [PATCH 3/5] Separate annotation of links with spec destination into
 post-processing

As suggested in
https://github.com/w3c/reffy/pull/1055#discussion_r961460419
---
 src/browserlib/canonicalize-url.mjs  | 51 ------------------
 src/browserlib/extract-links.mjs     | 17 ++-----
 src/lib/post-processor.js            |  5 +-
src/lib/specs-crawler.js | 8 +-- src/postprocessing/annotate-links.js | 74 ++++++++++++++++++++++++++++ 5 files changed, 84 insertions(+), 71 deletions(-) delete mode 100644 src/browserlib/canonicalize-url.mjs create mode 100644 src/postprocessing/annotate-links.js diff --git a/src/browserlib/canonicalize-url.mjs b/src/browserlib/canonicalize-url.mjs deleted file mode 100644 index f7637373..00000000 --- a/src/browserlib/canonicalize-url.mjs +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Return a canonicalized version of the given URL. - * - * By default, the canonicalized URL should represent the same resource and - * typically de-reference to the same document (or a subpage of it). - * - * Canonicalization can be made a bit stronger through options, in particular - * to canonicalize dated W3C URLs to the Latest version, and to use a list of - * equivalent URLs (that the crawler typically generates). - */ -export function canonicalizeUrl(url, options) { - options = options || {}; - - let canon = url.replace(/^http:/, 'https:') - .split('#')[0] - .replace('index.html', '') - .replace('Overview.html', '') - .replace('cover.html', '') - .replace(/spec.whatwg.org\/.*/, 'spec.whatwg.org/') // subpage to main document in whatwg - .replace(/w3.org\/TR\/(([^\/]+\/)+)[^\/]+\.[^\/]+$/, 'w3.org/TR/$1') // subpage to main document in w3c - .replace(/w3.org\/TR\/([^\/]+)$/, 'w3.org/TR/$1/') // enforce trailing slash - .replace(/w3c.github.io\/([^\/]+)$/, 'w3c.github.io/$1/') // enforce trailing slash for ED on GitHub - ; - - if (options.datedToLatest) { - canon = canon.replace( - /w3.org\/TR\/[0-9]{4}\/[A-Z]+-(.*)-[0-9]{8}\/?/, - 'w3.org/TR/$1/'); - } - - let equivalentUrls = (options.equivalents) ? options.equivalents[canon] : null; - if (Array.isArray(equivalentUrls)) { - return (options.returnAlternatives ? equivalentUrls : equivalentUrls[0]); - } - else { - return (equivalentUrls ? equivalentUrls : canon); - } -} - - -export function canonicalizesTo(url, refUrl, options) { - let newOptions = { - datedToLatest: (options ? options.datedToLatest : false), - equivalents: (options ? options.equivalents : null), - returnAlternatives: true - }; - let canon = canonicalizeUrl(url, newOptions); - return Array.isArray(refUrl) ? 
-        refUrl.some(u => canon.includes(u)) :
-        canon.includes(refUrl);
-}
\ No newline at end of file
diff --git a/src/browserlib/extract-links.mjs b/src/browserlib/extract-links.mjs
index fe614e79..919834e3 100644
--- a/src/browserlib/extract-links.mjs
+++ b/src/browserlib/extract-links.mjs
@@ -1,9 +1,7 @@
-import { canonicalizeUrl } from './canonicalize-url.mjs';
-
 /**
- * Extract and canonicalize absolute links of the document and their fragments
+ * Extract absolute links of the document and their fragments
  */
-export default function (spec, _, specs) {
+export default function () {
   const links = {};
   document.querySelectorAll('a[href^=http]').forEach(n => {
     // Ignore links from the "head" section, which either link to
@@ -17,21 +15,12 @@ export default function (spec, _, specs) {
     if (n.href.includes('#') && n.href.split('#')[1]) {
       links[pageUrl].anchors.add(n.href.split('#')[1]);
     }
-
-    // Annotate with the spec to which the page belongs if we can find one
-    if (Array.isArray(specs)) {
-      const specUrl = canonicalizeUrl(pageUrl);
-      let matchingSpec = specs.find(s => s?.release?.url === specUrl || s?.nightly?.url === specUrl || (s?.series?.currentSpecification === s?.shortname && (s?.series?.nightlyUrl === specUrl || s?.series?.releaseUrl === specUrl)) || s?.nightly?.pages?.includes(specUrl) || s?.release?.pages?.includes(specUrl));
-      if (matchingSpec) {
-        links[pageUrl].specShortname = matchingSpec.shortname;
-      }
-    }
   });
 
   return Object.keys(links)
     .sort()
     // turning sets into arrays
     .reduce((acc, u) => {
-      acc[u] = {specShortname: links[u].specShortname};
+      acc[u] = {};
       if (links[u].anchors.size > 0) {
         acc[u].anchors = [...links[u].anchors];
       }
diff --git a/src/lib/post-processor.js b/src/lib/post-processor.js
index e13b5098..0c518207 100644
--- a/src/lib/post-processor.js
+++ b/src/lib/post-processor.js
@@ -59,7 +59,8 @@ const modules = {
     csscomplete: require('../postprocessing/csscomplete'),
     events: require('../postprocessing/events'),
     idlnames: require('../postprocessing/idlnames'),
-    idlparsed: require('../postprocessing/idlparsed')
+    idlparsed: require('../postprocessing/idlparsed'),
+    annotatelinks: require('../postprocessing/annotate-links')
 };
 
 
@@ -266,4 +267,4 @@ module.exports = {
     extractsPerSeries,
     dependsOn,
     appliesAtLevel
-};
\ No newline at end of file
+};
diff --git a/src/lib/specs-crawler.js b/src/lib/specs-crawler.js
index f3864124..c6c79f36 100644
--- a/src/lib/specs-crawler.js
+++ b/src/lib/specs-crawler.js
@@ -91,18 +91,18 @@ async function crawlSpec(spec, crawlOptions) {
     else {
         result = await processSpecification(
             urlToCrawl,
-            (spec, modules, speclist) => {
+            (spec, modules) => {
                 const idToHeading = modules.find(m => m.needsIdToHeadingMap) ?
window.reffy.mapIdsToHeadings() : null; const res = { crawled: window.location.toString() }; modules.forEach(mod => { - res[mod.property] = window.reffy[mod.name](spec, idToHeading, speclist); + res[mod.property] = window.reffy[mod.name](spec, idToHeading); }); return res; }, - [spec, crawlOptions.modules, crawlOptions.speclist], + [spec, crawlOptions.modules], { quiet: crawlOptions.quiet, forceLocalFetch: crawlOptions.forceLocalFetch, ...cacheInfo} @@ -294,7 +294,7 @@ async function saveSpecResults(spec, settings) { async function crawlList(speclist, crawlOptions) { // Make a shallow copy of crawl options object since we're going // to modify properties in place - crawlOptions = Object.assign({speclist}, crawlOptions); + crawlOptions = Object.assign({}, crawlOptions); // Expand list of processing modules to use if not already done crawlOptions.modules = expandBrowserModules(crawlOptions.modules); diff --git a/src/postprocessing/annotate-links.js b/src/postprocessing/annotate-links.js new file mode 100644 index 00000000..3bcfb92b --- /dev/null +++ b/src/postprocessing/annotate-links.js @@ -0,0 +1,74 @@ +/** + * Post-processing module that annotates links extracts with the spec + shortname they link to + */ +const fs = require('fs'); +const path = require('path'); + +function canonicalizeUrl(url, options) { + options = options || {}; + + let canon = url.replace(/^http:/, 'https:') + .split('#')[0] + .replace('index.html', '') + .replace('Overview.html', '') + .replace('cover.html', '') + .replace(/spec.whatwg.org\/.*/, 'spec.whatwg.org/') // subpage to main document in whatwg + .replace(/w3.org\/TR\/(([^\/]+\/)+)[^\/]+\.[^\/]+$/, 'w3.org/TR/$1') // subpage to main document in w3c + .replace(/w3.org\/TR\/([^\/]+)$/, 'w3.org/TR/$1/') // enforce trailing slash + .replace(/w3c.github.io\/([^\/]+)$/, 'w3c.github.io/$1/') // enforce trailing slash for ED on GitHub + ; + + if (options.datedToLatest) { + canon = canon.replace( + /w3.org\/TR\/[0-9]{4}\/[A-Z]+-(.*)-[0-9]{8}\/?/, + 'w3.org/TR/$1/'); + } + + let equivalentUrls = (options.equivalents) ? options.equivalents[canon] : null; + if (Array.isArray(equivalentUrls)) { + return (options.returnAlternatives ? equivalentUrls : equivalentUrls[0]); + } + else { + return (equivalentUrls ? 
equivalentUrls : canon);
+  }
+}
+
+const needsSaving = {};
+
+module.exports = {
+  dependsOn: ['links'],
+  input: 'crawl',
+  property: 'links',
+
+  run: function(crawl, options) {
+    crawl.results.forEach(s => {
+      for (let link of Object.keys(s.links || {})) {
+        // Annotate with the spec to which the page belongs if we can find one
+        const specUrl = canonicalizeUrl(link);
+        let matchingSpec = crawl.results.find(s => s?.release?.url === specUrl || s?.nightly?.url === specUrl || (s?.series?.currentSpecification === s?.shortname && (s?.series?.nightlyUrl === specUrl || s?.series?.releaseUrl === specUrl)) || s?.nightly?.pages?.includes(specUrl) || s?.release?.pages?.includes(specUrl));
+        if (matchingSpec) {
+          needsSaving[s.shortname] = true;
+          s.links[link].specShortname = matchingSpec.shortname;
+        }
+      }
+    });
+    return crawl;
+  },
+
+  save: async function({results}, options) {
+    return Promise.all(Object.values(results).map(async spec => {
+      const contents = {
+        spec: {
+          title: spec.title,
+          url: spec.crawled
+        },
+        links: spec.links
+      };
+      const json = JSON.stringify(contents, null, 2);
+      const folder = path.join(options.output, "links");
+      const filename = path.join(folder, `${spec.shortname}.json`);
+      return await fs.promises.writeFile(filename, json);
+    }));
+  }
+};

From f3e68e0e027c025223deabc847af90d2badab214 Mon Sep 17 00:00:00 2001
From: Dominique Hazael-Massieux
Date: Mon, 5 Sep 2022 11:47:13 +0200
Subject: [PATCH 4/5] Simplify canonicalizeUrl code

More complex options are only needed in strudy, which will maintain its
own version for now

See also https://github.com/w3c/reffy/pull/1055#discussion_r961818285
---
 src/postprocessing/annotate-links.js | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/src/postprocessing/annotate-links.js b/src/postprocessing/annotate-links.js
index 3bcfb92b..7d6520f5 100644
--- a/src/postprocessing/annotate-links.js
+++ b/src/postprocessing/annotate-links.js
@@ -5,10 +5,8 @@
 const fs = require('fs');
 const path = require('path');
 
-function canonicalizeUrl(url, options) {
-  options = options || {};
-
-  let canon = url.replace(/^http:/, 'https:')
+function canonicalizeUrl(url) {
+  return url.replace(/^http:/, 'https:')
     .split('#')[0]
     .replace('index.html', '')
     .replace('Overview.html', '')
     .replace('cover.html', '')
     .replace(/spec.whatwg.org\/.*/, 'spec.whatwg.org/') // subpage to main document in whatwg
     .replace(/w3.org\/TR\/(([^\/]+\/)+)[^\/]+\.[^\/]+$/, 'w3.org/TR/$1') // subpage to main document in w3c
@@ -18,20 +16,6 @@ function canonicalizeUrl(url) {
     .replace(/w3.org\/TR\/([^\/]+)$/, 'w3.org/TR/$1/') // enforce trailing slash
     .replace(/w3c.github.io\/([^\/]+)$/, 'w3c.github.io/$1/') // enforce trailing slash for ED on GitHub
     ;
-
-  if (options.datedToLatest) {
-    canon = canon.replace(
-      /w3.org\/TR\/[0-9]{4}\/[A-Z]+-(.*)-[0-9]{8}\/?/,
-      'w3.org/TR/$1/');
-  }
-
-  let equivalentUrls = (options.equivalents) ? options.equivalents[canon] : null;
-  if (Array.isArray(equivalentUrls)) {
-    return (options.returnAlternatives ? equivalentUrls : equivalentUrls[0]);
-  }
-  else {
-    return (equivalentUrls ?
equivalentUrls : canon);
-  }
-}
 
 const needsSaving = {};
 

From 6565feb05144cecae400eff00cd67156f4b88b98 Mon Sep 17 00:00:00 2001
From: Dominique Hazael-Massieux
Date: Mon, 5 Sep 2022 13:18:47 +0200
Subject: [PATCH 5/5] Use spec-mode for annotate-links post-processing

As suggested in
https://github.com/w3c/reffy/pull/1055#discussion_r961831703
---
 src/lib/specs-crawler.js             |  2 +-
 src/postprocessing/annotate-links.js | 43 +++++++++-------------------
 2 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/src/lib/specs-crawler.js b/src/lib/specs-crawler.js
index c6c79f36..68bac0a3 100644
--- a/src/lib/specs-crawler.js
+++ b/src/lib/specs-crawler.js
@@ -294,7 +294,7 @@ async function saveSpecResults(spec, settings) {
 async function crawlList(speclist, crawlOptions) {
     // Make a shallow copy of crawl options object since we're going
     // to modify properties in place
-    crawlOptions = Object.assign({}, crawlOptions);
+    crawlOptions = Object.assign({speclist}, crawlOptions);
 
     // Expand list of processing modules to use if not already done
     crawlOptions.modules = expandBrowserModules(crawlOptions.modules);
diff --git a/src/postprocessing/annotate-links.js b/src/postprocessing/annotate-links.js
index 7d6520f5..b6aa58a3 100644
--- a/src/postprocessing/annotate-links.js
+++ b/src/postprocessing/annotate-links.js
@@ -22,37 +22,22 @@ const needsSaving = {};
 
 module.exports = {
   dependsOn: ['links'],
-  input: 'crawl',
+  input: 'spec',
   property: 'links',
 
-  run: function(crawl, options) {
-    crawl.results.forEach(s => {
-      for (let link of Object.keys(s.links || {})) {
-        // Annotate with the spec to which the page belongs if we can find one
-        const specUrl = canonicalizeUrl(link);
-        let matchingSpec = crawl.results.find(s => s?.release?.url === specUrl || s?.nightly?.url === specUrl || (s?.series?.currentSpecification === s?.shortname && (s?.series?.nightlyUrl === specUrl || s?.series?.releaseUrl === specUrl)) || s?.nightly?.pages?.includes(specUrl) || s?.release?.pages?.includes(specUrl));
-        if (matchingSpec) {
-          needsSaving[s.shortname] = true;
-          s.links[link].specShortname = matchingSpec.shortname;
-        }
+  run: function(spec, {speclist}) {
+    if (!speclist || !speclist.length) {
+      console.error("No spec list passed as input, cannot annotate links in post-processing");
+      return spec;
+    }
+    for (let link of Object.keys(spec.links || {})) {
+      // Annotate with the spec to which the page belongs if we can find one
+      const specUrl = canonicalizeUrl(link);
+      let matchingSpec = speclist.find(s => s?.release?.url === specUrl || s?.nightly?.url === specUrl || (s?.series?.currentSpecification === s?.shortname && (s?.series?.nightlyUrl === specUrl || s?.series?.releaseUrl === specUrl)) || s?.nightly?.pages?.includes(specUrl) || s?.release?.pages?.includes(specUrl));
+      if (matchingSpec) {
+        spec.links[link].specShortname = matchingSpec.shortname;
       }
-    });
-    return crawl;
-  },
-
-  save: async function({results}, options) {
-    return Promise.all(Object.values(results).map(async spec => {
-      const contents = {
-        spec: {
-          title: spec.title,
-          url: spec.crawled
-        },
-        links: spec.links
-      };
-      const json = JSON.stringify(contents, null, 2);
-      const folder = path.join(options.output, "links");
-      const filename = path.join(folder, `${spec.shortname}.json`);
-      return await fs.promises.writeFile(filename, json);
-    }));
+    }
+    return spec;
   }
 };
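
Note: with the full series applied, entries in a links extract are keyed by
the original (non-canonicalized) page URL, and the annotate-links
post-processing module adds a "specShortname" property to each entry it can
match against the spec list. A minimal sketch of the resulting shape for one
entry, based on the mediacapture-output test fixture in PATCH 1/5 (the
"webidl" shortname value is an assumption about what the spec list would
contain for that URL):

    {
      "https://webidl.spec.whatwg.org/": {
        "anchors": [
          "Exposed",
          "idl-DOMString"
        ],
        "specShortname": "webidl"
      }
    }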