Re-work and simplify search plugin

2026-01-10 06:57:58 -05:00 · 2023-01-09 09:48:01 +00:00
parent 297d69e4a3
commit 1d8d0cf9fa
5 changed files with 140 additions and 142 deletions
--- a/gatsby-config.js
+++ b/gatsby-config.js
@@ -91,8 +91,6 @@ module.exports = {
      resolve: 'my-search-index',
      options: {
        enabled: true,
-        // Query that matches the element via which the HTML is included in the page template.
-        root: 'main',
        // Matching elements have their text added to the index. First match wins.
        chunkTypes: [
          {query: 'figcaption', label: 'Figure caption'},
@@ -103,14 +101,14 @@ module.exports = {
          {query: 'h3, h4, h5, h6', label: 'Heading'},
          {query: 'p', label: 'Paragraph'},
        ],
-        // Note, only pages under src/md/pages have a "hide" property.
-        pageFilter: '{frontmatter: {hide: {eq: false}}}',
        exclude: {
-          // Speed up the build (these are excluded from the index by pageFilter, anyway).
-          pages: ['/404.html', '/annotated-spec/', '/contact/', '/contents/', '/search/', '/'],
-          // Elements matching this query are ignored completely, including their text.
+          // Only pages under src/md/pages have a "hide" property; {hide: null} excludes the rest.
+          frontmatter: [{hide: true}, {hide: null}],
+          // The frontmatter filter takes care of excluding a good set of pages for now.
+          pages: [],
+          // Elements matching this query are ignored completely, including their text:
          ignore: 'svg *, details *, mtable *, mrow *, [aria-hidden="true"] *, .footnote-ref',
-          // Chunks matching this query are excluded as duplicates (to handle nested matches).
+          // Chunks matching this query are excluded as duplicates (to handle nested matches):
          dedup: '[id^="fn-"] *, figcaption *, li *',
        }
      },
--- a/plugins/my-search-index/gatsby-node.js
+++ b/plugins/my-search-index/gatsby-node.js
@@ -1,7 +1,7 @@
 const cheerio = require('cheerio')

 /*
- * Creates a GraphQL node containing data for the local search
+ * Creates GraphQL nodes containing data for the local search
 */

 // Concatenate all text in child nodes while respecting exclusions
@@ -9,15 +9,11 @@ const getText = ($, node, exclude) => {

  let text = ''

-  if ($(node).is(exclude.ignore)) {
-    return text
-  }
-
  if (node.type === 'text') {
    text += node.data
  }

-  $(node).contents().each(function (i, e) {
+  $(node).contents().not(exclude.ignore).each(function (i, e) {
    text += getText($, e, exclude)
  })

@@ -25,44 +21,65 @@ const getText = ($, node, exclude) => {
 }

 // Recurse until we find an element we want to treat as a chunk, then get all its text content.
-const getChunks = ($, node, chunkTypes, exclude) => {
+const getChunks = ($, node, chunkTypes, exclude, counts) => {
+
+  if (counts === undefined) {
+    counts = Array(chunkTypes.length).fill(0)
+  }

  const chunks = []

-  if ($(node).is(exclude.ignore) || $(node).is(exclude.dedup)) {
-    return chunks
-  }
+  for (let idx = 0; idx < chunkTypes.length; idx++) {

-  chunkTypes.every( (type) => {
+    const type = chunkTypes[idx]
    if ($(node).is(type.query)) {
+
+      const tagName = $(node).get(0).tagName
+      let id = $(node).attr('id')
+      if ( id === undefined) {
+        id = tagName + '_' + counts[idx]
+        $(node).attr('id', id)
+        ++counts[idx]
+      }
+
      const text = getText($, node, exclude)
      if (text !== '') {
        chunks.push(
          {
-            type: $(node).get(0).tagName,
+            type: tagName,
            label: type.label,
-            id: $(node).attr('id'),
+            id: id,
            text: text,
          })
      }
-      // Add a node only once
-      return false
+      break
    }
-    return true
+  }
+
+  $(node).children().not(exclude.ignore).not(exclude.dedup).each(function (i, e) {
+    chunks.push(...getChunks($, e, chunkTypes, exclude, counts))
  })

-  $(node).children().each(function (i, e) {
-    chunks.push(getChunks($, e, chunkTypes, exclude))
-  })
+  return chunks
+}

-  return chunks.flat()
+// True if the frontmatter matches any test in exclude.frontmatter (first key of each test only).
+const isExcludedFrontmatter = (frontmatter, exclude) => {
+  // The default for exclude is lost when the user supplies only some keys: tolerate no list.
+  for (const test of exclude.frontmatter ?? []) {
+    const [key] = Object.keys(test)
+    // Loose equality: a null test value also matches a property that is undefined.
+    if (Object.prototype.hasOwnProperty.call(frontmatter, key) && frontmatter[key] == test[key]) {
+      return true
+    }
+  }
+  return false
 }

 exports.createPages = async (
  {
-    actions,
+    actions: { createNode },
    graphql,
-    reporter,
    createNodeId,
    createContentDigest,
  }, pluginOptions,
@@ -70,77 +87,83 @@ exports.createPages = async (

  const {
    enabled = true,
-    root = '',
    chunkTypes = [],
-    pageFilter = '{}',
-    exclude = {pages: [], ignore: '', dedup: ''},
+    exclude = {frontmatter: [], pages: [], ignore: '', dedup: ''},
  } = pluginOptions

-  const mySearchData = []
-
-  if (enabled) {
-
-    const result = await graphql(`
-      {
-        allMarkdownRemark(filter: ${pageFilter}) {
-          edges {
-            node {
-              html
-              frontmatter {
-                path
-                titles
-              }
+  const result = await graphql(`
+    {
+      allMarkdownRemark {
+        edges {
+          node {
+            html
+            frontmatter {
+              path
+              index
+              sequence
+              titles
+              hide
            }
          }
        }
      }
-    `)
-
-    const pages = result.data.allMarkdownRemark.edges
-
-    await Promise.all(pages.map(async (page) => {
-
-      const frontmatter = page.node.frontmatter
-      if (frontmatter !== undefined && exclude.pages.indexOf(frontmatter.path) === -1) {
-
-        // Get the HTML. This is the contents of `dangerouslySetInnerHTML={{ __html: html }}`
-        // in the page template.
-        const $ = cheerio.load(page.node.html, null, false)
-
-        // Changes to the HTML AST made here will not persist, but we need to do
-        // exactly the same as in gatsby-ssr so that our ids end up consistent.
-        chunkTypes.forEach( (type) => {
-          $(type.query).not(exclude.ignore).not(exclude.dedup).not('[id]').each( function (i, e) {
-            $(this).attr('id', $(this).get(0).tagName + '_' + i)
-          })
-        })
-
-        const chunks = getChunks($, $.root(), chunkTypes, exclude)
-
-        mySearchData.push({
-          path: frontmatter.path,
-          title: frontmatter.titles.filter(x => x !== '').join(' | '),
-          chunks: chunks,
-        })
-      }
-    }))
-  }
-
-  name = 'mySearchData'
-  actions.createNode({
-    id: createNodeId(name),
-    data: mySearchData,
-    internal: {
-      type: name,
-      contentDigest: createContentDigest(mySearchData)
    }
-  })
+  `)
+
+  const pages = result.data.allMarkdownRemark.edges
+
+  await Promise.all(pages.map(async (page) => {
+
+    const $ = cheerio.load(page.node.html, null, false)
+
+    const frontmatter = page.node.frontmatter
+    let chunks = []
+
+    if (enabled
+        && frontmatter !== undefined
+        && isExcludedFrontmatter(frontmatter, exclude) === false
+        && exclude.pages.indexOf(frontmatter.path) === -1) {
+
+      chunks = getChunks($, $.root(), chunkTypes, exclude)
+    }
+
+    // It seems to be hard to modify the underlying MarkdownRemark node's HTML, so we store the
+    // HTML (with any ids added by getChunks) on a new node that the page template queries.
+    const nodeData = {
+      frontmatter: {
+        path: frontmatter.path,
+        index: frontmatter.index,
+        titles: frontmatter.titles,
+        sequence: frontmatter.sequence,
+      },
+      chunks: chunks,
+      html: $.html(),
+    }
+
+    createNode({
+      ...nodeData,
+      id: createNodeId(nodeData.frontmatter.path),
+      internal: {
+        type: 'mySearchData',
+        contentDigest: createContentDigest(nodeData)
+      }
+    })
+  }))
 }

 exports.createSchemaCustomization = ({ actions: { createTypes } }) => {
  createTypes(`
+    type Frontmatter {
+      path: String!
+      index: [Int]
+      titles: [String]
+      sequence: Int
+    }
+
    type mySearchData implements Node {
-      data: JSON
+      frontmatter: Frontmatter!
+      chunks: JSON
+      html: String
    }
  `)
 }
--- a/plugins/my-search-index/gatsby-ssr.js
+++ b/plugins/my-search-index/gatsby-ssr.js
@@ -1,33 +0,0 @@
-const { renderToString } = require('react-dom/server')
-const cheerio = require('cheerio')
-
-/*
- * Adds ID anchors to all elements that might appear in the local search
- */
-
-exports.replaceRenderer = ({ pathname, bodyComponent, replaceBodyHTMLString }, pluginOptions) => {
-
-  const {
-    enabled = true,
-    root = 'body',
-    chunkTypes = [],
-    exclude = {pages: [], ignore: '', dedup: ''},
-  } = pluginOptions
-
-  if (enabled && exclude.pages.indexOf(pathname) == -1) {
-
-    // Get the HTML
-    const html = renderToString(bodyComponent)
-    const $ = cheerio.load(html, null, false)
-
-    // Modify the HTML - add id attributes where required.
-    chunkTypes.forEach( (type) => {
-      $(root + ' *').filter(type.query).not(exclude.ignore).not(exclude.dedup).not('[id]').each( function (i, e) {
-        $(this).attr('id', $(this).get(0).tagName + '_' + i)
-      })
-    })
-
-    // Replace the HTML
-    replaceBodyHTMLString($.html())
-  }
-}
--- a/src/components/search.js
+++ b/src/components/search.js
@@ -14,15 +14,16 @@ const getSearchResults = (query, data) => {
  }

  // Match the starts of words only. The "d" flag gives us the matching indices.
-  const regex = RegExp('(^|\\W|_)' + escapeRegExp(query.searchText), 'gd' + (query.isCaseSensitive ? '' : 'i'))
+  const regex = RegExp('(^|\\W|_)' + escapeRegExp(query.searchText),
+                       'gd' + (query.isCaseSensitive ? '' : 'i'))

-  const result = data.map( (page) => {
+  const result = data.map( ({ node }) => {

    let score = 0
    const matches = []
-    for (let i = 0; i < page.chunks?.length; i++) {
+    for (let i = 0; i < node.chunks?.length; i++) {

-      let chunk = page.chunks[i]
+      let chunk = node.chunks[i]
      let match
      const indices = []
      while ((match = regex.exec(chunk.text)) !== null) {
@@ -44,27 +45,35 @@ const getSearchResults = (query, data) => {
    }

    return matches.length === 0 ? null : {
-      url: page.path,
-      title: page.title,
+      url: node.frontmatter.path,
+      title: node.frontmatter.titles.filter(x => x).join(' | '),
      matches: matches,
      score: score,
    }
  })

-  return result.filter(x => x !== null).sort((a, b) => (b.score - a.score))
+  return result.filter(x => x).sort((a, b) => (b.score - a.score))
 }

 const Search = () => {

  const queryData = useStaticQuery(graphql`
    query {
-      mySearchData {
-        data
+      allMySearchData {
+        edges {
+          node {
+            frontmatter {
+              path
+              titles
+            }
+            chunks
+          }
+        }
      }
    }
  `)

-  const searchData = queryData.mySearchData.data
+  const searchData = queryData.allMySearchData.edges

  const [searchQuery, setQuery] = React.useState({
    searchText: '',
@@ -82,7 +91,7 @@ const Search = () => {
      return { ...previousState, isCaseSensitive: !previousState.isCaseSensitive }
    });
  }
-  
+
  const results = getSearchResults(searchQuery, searchData)

  const pages = results.map((result) => {
@@ -93,7 +102,7 @@ const Search = () => {
          <span className='match-text'>
            {match.text.substring(indices[0], indices[1])}
          </span>,
-          (i === match.indices.length -1) ? match.text.substring(indices[1]) : '',
+          (i === match.indices.length - 1) ? match.text.substring(indices[1]) : '',
        ]
      })
      return (
@@ -141,7 +150,7 @@ const Search = () => {
          <label htmlFor="is-case-sensitive">Case sensitive</label>
        </span>
      </div>
-      <div id="search-results">  
+      <div id="search-results">
        {results.length > 0 ? (
          <ul>
            {pages}
--- a/src/templates/pageTemplate.js
+++ b/src/templates/pageTemplate.js
@@ -17,14 +17,14 @@ import "../css/page.css"

 export function Head({ data }) {

-  const { markdownRemark, site } = data
-  const { frontmatter } = markdownRemark
+  const { mySearchData, site } = data
+  const frontmatter = mySearchData.frontmatter

  const indexArray = frontmatter.index

  var pageTitle = site.siteMetadata.title
  if (frontmatter.titles !== null) {
-    const titles = frontmatter.titles.filter(x => x !== "")
+    const titles = frontmatter.titles.filter(x => x !== '')
    const number = (indexArray.length >= 2) ? indexArray.join('.') : ''
    pageTitle += ' | ' + number + ' ' + titles[titles.length - 1]
  }
@@ -36,16 +36,17 @@ export function Head({ data }) {

 export default function Template({ data }) {

-  const { html, frontmatter } = data.markdownRemark
+  const { html, frontmatter } = data.mySearchData
  const indexArray = frontmatter.index
+  const path = frontmatter.path

-  const pageExtras = frontmatter.path.startsWith('/search')
+  const pageExtras = path.startsWith('/search')
        ? <Search />
        : <Subsections indexArray={indexArray} />

  return (
      <>
-        <Banner path={frontmatter.path} />
+        <Banner path={path} />
        <div id="page">
          <Sidebar index={frontmatter.index} />
          <div id="main-content">
@@ -58,7 +59,7 @@ export default function Template({ data }) {
            <Footer />
            <PrevNext seq={frontmatter.sequence} />
          </div>
-          <PageNavi path={frontmatter.path} />
+          <PageNavi path={path} />
          <FootnoteTooltips />
          </div>
        <PrintScripts />
@@ -68,7 +69,7 @@ export default function Template({ data }) {

 export const pageQuery = graphql`
  query($path: String!) {
-    markdownRemark(frontmatter: { path: { eq: $path } }) {
+    mySearchData(frontmatter: { path: { eq: $path } }) {
      frontmatter {
        index
        path