Desktop: Fixes #8448: Merge changes from upstream turndown project (#8468)

2025-03-03 15:32:30 +02:00 · 2023-07-17 04:19:34 -07:00 · 2023-07-17 04:19:34 -07:00 · 1e2e8ed099
commit 1e2e8ed099
parent 4d7399973e
11 changed files with 173 additions and 126 deletions
--- a/packages/app-cli/tests/html_to_md/bold_italic_with_spaces.html
+++ b/packages/app-cli/tests/html_to_md/bold_italic_with_spaces.html
@ -0,0 +1 @@
+<p><b>&nbsp; &nbsp;A test...&nbsp;</b>Test</p>
--- a/packages/app-cli/tests/html_to_md/bold_italic_with_spaces.md
+++ b/packages/app-cli/tests/html_to_md/bold_italic_with_spaces.md
@ -0,0 +1 @@
+&nbsp; &nbsp;**A test...**&nbsp;Test
--- a/packages/app-cli/tests/html_to_md/picture_with_no_img.md
+++ b/packages/app-cli/tests/html_to_md/picture_with_no_img.md
@ -1 +1 @@
-Some pictures: ![](https://static2.cbrimages.com/wp-content/uploads/2018/09/Die-01-cvrA.jpg?q=35&w=588&h=900&fit=crop&dpr=1.5) ![](http://example.com/test.gif)
+Some pictures: ![](https://static2.cbrimages.com/wp-content/uploads/2018/09/Die-01-cvrA.jpg?q=35&w=588&h=900&fit=crop&dpr=1.5)![](http://example.com/test.gif)
--- a/packages/app-cli/tests/html_to_md/table_with_empty_header.md
+++ b/packages/app-cli/tests/html_to_md/table_with_empty_header.md
@ -5,5 +5,6 @@
 | [Source](https://github.com/nim-lang/nim) | The github project |
 | [nimble](https://github.com/nim-lang/nimble) | The nim package manager |
 | [choosenim](https://github.com/dom96/choosenim) | Toolchain installer |
+|     |     |
 | **Community** |     |
 | [Forums](https://forum.nim-lang.org) | An async discussion board |
--- a/packages/turndown/README.md
+++ b/packages/turndown/README.md
@ -19,6 +19,9 @@ Convert HTML into Markdown with JavaScript.
 - Allow a rule to specify whether it wants its content escaped or not
 - Handle [non-OL ordered lists](https://developer.mozilla.org/en-US/docs/Web/CSS/list-style-type)
 - Added option `preserveImageTagsWithSize` to keep `<img/>` tags as HTML (no Markdown conversion) if they have width or height attributes
+- Added support for replacing Unicode non-breaking spaces (U+00A0) with `&nbsp;` in the output Markdown.
+
+The `src/` folder of this fork is currently based on commit `97e4535ca76bb2e70d9caa2aa4d4686956b06d44` of the [upstream Turndown project](https://github.com/mixmark-io/turndown). The `test` and `config` folders are based on an earlier commit.

 ### to-markdown has been renamed to Turndown. See the [migration guide](https://github.com/domchristie/to-markdown/wiki/Migrating-from-to-markdown-to-Turndown) for details.

--- a/packages/turndown/src/collapse-whitespace.js
+++ b/packages/turndown/src/collapse-whitespace.js
@ -49,7 +49,7 @@ function collapseWhitespace (options) {
  if (!element.firstChild || isPre(element)) return

  var prevText = null
-  var prevVoid = false
+  var keepLeadingWs = false

  var prev = null
  var node = next(prev, element, isPre)
@ -58,13 +58,12 @@ function collapseWhitespace (options) {
  // added, which results in multiple spaces. These spaces are then incorrectly interpreted as a code block by renderers.
  // So by keeping track of this, we make sure that only one space at most is added.
  var prevTextIsOnlySpaces = false;
-
  while (node !== element) {
    if (node.nodeType === 3 || node.nodeType === 4) { // Node.TEXT_NODE or Node.CDATA_SECTION_NODE
      var text = node.data.replace(/[ \r\n\t]+/g, ' ')

      if ((!prevText || / $/.test(prevText.data)) &&
-          !prevVoid && text[0] === ' ') {
+          !keepLeadingWs && text[0] === ' ') {
        text = text.substr(1)
      }

@ -87,11 +86,14 @@ function collapseWhitespace (options) {
        }

        prevText = null
-        prevVoid = false
-      } else if (isVoid(node)) {
-        // Avoid trimming space around non-block, non-BR void elements.
+        keepLeadingWs = false
+      } else if (isVoid(node) || isPre(node)) {
+        // Avoid trimming space around non-block, non-BR void elements and inline PRE.
        prevText = null
-        prevVoid = true
+        keepLeadingWs = true
+      } else if (prevText) {
+        // Drop protection if set previously.
+        keepLeadingWs = false
      }
    } else {
      node = remove(node)
--- a/packages/turndown/src/commonmark-rules.js
+++ b/packages/turndown/src/commonmark-rules.js
@ -215,11 +215,25 @@ rules.fencedCodeBlock = {

    var className = handledNode.className || ''
    var language = (className.match(/language-(\S+)/) || [null, ''])[1]
+    var code = content
+
+    var fenceChar = options.fence.charAt(0)
+    var fenceSize = 3
+    var fenceInCodeRegex = new RegExp('^' + fenceChar + '{3,}', 'gm')
+
+    var match
+    while ((match = fenceInCodeRegex.exec(code))) {
+      if (match[0].length >= fenceSize) {
+        fenceSize = match[0].length + 1
+      }
+    }
+
+    var fence = repeat(fenceChar, fenceSize)

    return (
-      '\n\n' + options.fence + language + '\n' +
-      content + 
-      '\n' + options.fence + '\n\n'
+      '\n\n' + fence + language + '\n' +
+      code.replace(/\n$/, '') +
+      '\n' + fence + '\n\n'
    )
  }
 }
@ -407,19 +421,15 @@ rules.code = {
  },

  replacement: function (content) {
-    if (!content.trim()) return ''
+    if (!content) return ''
+    content = content.replace(/\r?\n|\r/g, ' ')

+    var extraSpace = /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : ''
    var delimiter = '`'
-    var leadingSpace = ''
-    var trailingSpace = ''
-    var matches = content.match(/`+/gm)
-    if (matches) {
-      if (/^`/.test(content)) leadingSpace = ' '
-      if (/`$/.test(content)) trailingSpace = ' '
-      while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`'
-    }
+    var matches = content.match(/`+/gm) || []
+    while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`'

-    return delimiter + leadingSpace + content + trailingSpace + delimiter
+    return delimiter + extraSpace + content + extraSpace + delimiter
  }
 }

--- a/packages/turndown/src/node.js
+++ b/packages/turndown/src/node.js
@ -1,42 +1,56 @@
-import { isBlock, isVoid, hasVoid, isCodeBlock } from './utilities'
+import { isBlock, isVoid, hasVoid, isCodeBlock, isMeaningfulWhenBlank, hasMeaningfulWhenBlank } from './utilities'

-export default function Node (node) {
+export default function Node (node, options) {
  node.isBlock = isBlock(node)
-  node.isCode = node.nodeName.toLowerCase() === 'code' || node.parentNode.isCode || isCodeBlock(node);
+  node.isCode = node.nodeName === 'CODE' || node.parentNode.isCode || isCodeBlock(node);
  node.isBlank = isBlank(node)
-  node.flankingWhitespace = flankingWhitespace(node)
+  node.flankingWhitespace = flankingWhitespace(node, options)
  return node
 }

 function isBlank (node) {
  return (
-    ['A', 'TH', 'TD'].indexOf(node.nodeName) === -1 &&
-    /^\s*$/i.test(node.textContent) &&
    !isVoid(node) &&
-    !hasVoid(node)
+    !isMeaningfulWhenBlank(node) &&
+    /^\s*$/i.test(node.textContent) &&
+    !hasVoid(node) &&
+    !hasMeaningfulWhenBlank(node)
  )
 }

-function flankingWhitespace (node) {
-  var leading = ''
-  var trailing = ''
-
-  if (!node.isBlock) {
-    var hasLeading = /^[ \r\n\t]/.test(node.textContent)
-    var hasTrailing = /[ \r\n\t]$/.test(node.textContent)
-
-    if (hasLeading && !isFlankedByWhitespace('left', node)) {
-      leading = ' '
-    }
-    if (hasTrailing && !isFlankedByWhitespace('right', node)) {
-      trailing = ' '
-    }
+function flankingWhitespace (node, options) {
+  if (node.isBlock || (options.preformattedCode && node.isCode)) {
+    return { leading: '', trailing: '' }
  }

-  return { leading: leading, trailing: trailing }
+  var edges = edgeWhitespace(node.textContent)
+
+  // abandon leading ASCII WS if left-flanked by ASCII WS
+  if (edges.leadingAscii && isFlankedByWhitespace('left', node, options)) {
+    edges.leading = edges.leadingNonAscii
+  }
+
+  // abandon trailing ASCII WS if right-flanked by ASCII WS
+  if (edges.trailingAscii && isFlankedByWhitespace('right', node, options)) {
+    edges.trailing = edges.trailingNonAscii
+  }
+
+  return { leading: edges.leading, trailing: edges.trailing }
 }

-function isFlankedByWhitespace (side, node) {
+function edgeWhitespace (string) {
+  var m = string.match(/^(([ \t\r\n]*)(\s*))(?:(?=\S)[\s\S]*\S)?((\s*?)([ \t\r\n]*))$/)
+  return {
+    leading: m[1], // whole string for whitespace-only strings
+    leadingAscii: m[2],
+    leadingNonAscii: m[3],
+    trailing: m[4], // empty for whitespace-only strings
+    trailingNonAscii: m[5],
+    trailingAscii: m[6]
+  }
+}
+
+function isFlankedByWhitespace (side, node, options) {
  var sibling
  var regExp
  var isFlanked
@ -52,6 +66,8 @@ function isFlankedByWhitespace (side, node) {
  if (sibling) {
    if (sibling.nodeType === 3) {
      isFlanked = regExp.test(sibling.nodeValue)
+    } else if (options.preformattedCode && sibling.nodeName === 'CODE') {
+      isFlanked = false
    } else if (sibling.nodeType === 1 && !isBlock(sibling)) {
      isFlanked = regExp.test(sibling.textContent)
    }
--- a/packages/turndown/src/root-node.js
+++ b/packages/turndown/src/root-node.js
@ -2,7 +2,7 @@ import collapseWhitespace from './collapse-whitespace'
 import HTMLParser from './html-parser'
 import { isBlock, isVoid } from './utilities'

-export default function RootNode (input) {
+export default function RootNode (input, options) {
  var root
  if (typeof input === 'string') {
    var doc = htmlParser().parseFromString(
@ -19,7 +19,8 @@ export default function RootNode (input) {
  collapseWhitespace({
    element: root,
    isBlock: isBlock,
-    isVoid: isVoid
+    isVoid: isVoid,
+    isPre: options.preformattedCode ? isPreOrCode : null
  })

  return root
@ -30,3 +31,7 @@ function htmlParser () {
  _htmlParser = _htmlParser || new HTMLParser()
  return _htmlParser
 }
+
+function isPreOrCode (node) {
+  return node.nodeName === 'PRE' || node.nodeName === 'CODE'
+}
--- a/packages/turndown/src/turndown.js
+++ b/packages/turndown/src/turndown.js
@ -1,11 +1,24 @@
 import COMMONMARK_RULES from './commonmark-rules'
 import Rules from './rules'
-import { extend, isCodeBlock } from './utilities'
+import { extend, isCodeBlock, trimLeadingNewlines, trimTrailingNewlines } from './utilities'
 import RootNode from './root-node'
 import Node from './node'
 var reduce = Array.prototype.reduce
-var leadingNewLinesRegExp = /^\n*/
-var trailingNewLinesRegExp = /\n*$/
+var escapes = [
+  [/\\/g, '\\\\'],
+  [/\*/g, '\\*'],
+  [/^-/g, '\\-'],
+  [/^\+ /g, '\\+ '],
+  [/^(=+)/g, '\\$1'],
+  [/^(#{1,6}) /g, '\\$1 '],
+  [/`/g, '\\`'],
+  [/^~~~/g, '\\~~~'],
+  [/\[/g, '\\['],
+  [/\]/g, '\\]'],
+  [/^>/g, '\\>'],
+  [/_/g, '\\_'],
+  [/^(\d+)\. /g, '$1\\. ']
+]

 export default function TurndownService (options) {
  if (!(this instanceof TurndownService)) return new TurndownService(options)
@ -23,7 +36,9 @@ export default function TurndownService (options) {
    linkReferenceStyle: 'full',
    anchorNames: [],
    br: '  ',
+    nonbreakingSpace: '&nbsp;',
    disableEscapeContent: false,
+    preformattedCode: false,
    blankReplacement: function (content, node) {
      return node.isBlock ? '\n\n' : ''
    },
@ -56,7 +71,7 @@ TurndownService.prototype = {

    if (input === '') return ''

-    var output = process.call(this, new RootNode(input))
+    var output = process.call(this, new RootNode(input, this.options))
    return postProcess.call(this, output)
  },

@ -128,48 +143,9 @@ TurndownService.prototype = {
   */

  escape: function (string) {
-    return (
-      string
-        // Escape backslash escapes!
-        .replace(/\\(\S)/g, '\\\\$1')
-
-        // Escape headings
-        .replace(/^(#{1,6} )/gm, '\\$1')
-
-        // Escape hr
-        .replace(/^([-*_] *){3,}$/gm, function (match, character) {
-          return match.split(character).join('\\' + character)
-        })
-
-        // Escape ol bullet points
-        .replace(/^(\W* {0,3})(\d+)\. /gm, '$1$2\\. ')
-
-        // Escape ul bullet points
-        .replace(/^([^\\\w]*)[*+-] /gm, function (match) {
-          return match.replace(/([*+-])/g, '\\$1')
-        })
-
-        // Escape blockquote indents
-        .replace(/^(\W* {0,3})> /gm, '$1\\> ')
-
-        // Escape em/strong *
-        .replace(/\*+(?![*\s\W]).+?\*+/g, function (match) {
-          return match.replace(/\*/g, '\\*')
-        })
-
-        // Escape em/strong _
-        .replace(/_+(?![_\s\W]).+?_+/g, function (match) {
-          return match.replace(/_/g, '\\_')
-        })
-
-        // Escape code _
-        .replace(/`+(?![`\s\W]).+?`+/g, function (match) {
-          return match.replace(/`/g, '\\`')
-        })
-
-        // Escape link brackets
-        .replace(/[\[\]]/g, '\\$&') // eslint-disable-line no-useless-escape
-    )
+    return escapes.reduce(function (accumulator, escape) {
+      return accumulator.replace(escape[0], escape[1])
+    }, string)
  }
 }

@ -186,7 +162,7 @@ function process (parentNode, escapeContent = 'auto') {

  var self = this
  return reduce.call(parentNode.childNodes, function (output, node) {
-    node = new Node(node)
+    node = new Node(node, self.options)

    var replacement = ''
    if (node.nodeType === 3) {
@ -239,39 +215,35 @@ function replacementForNode (node) {
  var content = process.call(this, node, rule.escapeContent ? rule.escapeContent() : 'auto')
  var whitespace = node.flankingWhitespace
  if (whitespace.leading || whitespace.trailing) content = content.trim()
+
+  const replaceNonbreakingSpaces = space => {
+    // \u{00A0} is a nonbreaking space
+    return space.replace(/\u{00A0}/ug, this.options.nonbreakingSpace);
+  };
+
  return (
-    whitespace.leading +
-    rule.replacement(content, node, this.options) +
-    whitespace.trailing
+    replaceNonbreakingSpaces(whitespace.leading) +
+    replaceNonbreakingSpaces(rule.replacement(content, node, this.options)) +
+    replaceNonbreakingSpaces(whitespace.trailing)
  )
 }

 /**
- * Determines the new lines between the current output and the replacement
+ * Joins the replacement to the current output with the appropriate number of newlines
 * @private
 * @param {String} output The current conversion output
 * @param {String} replacement The string to append to the output
- * @returns The whitespace to separate the current output and the replacement
+ * @returns Joined output
 * @type String
 */

-function separatingNewlines (output, replacement) {
-  var newlines = [
-    output.match(trailingNewLinesRegExp)[0],
-    replacement.match(leadingNewLinesRegExp)[0]
-  ].sort()
-  var maxNewlines = newlines[newlines.length - 1]
-  return maxNewlines.length < 2 ? maxNewlines : '\n\n'
-}
+function join (output, replacement) {
+  var s1 = trimTrailingNewlines(output)
+  var s2 = trimLeadingNewlines(replacement)
+  var nls = Math.max(output.length - s1.length, replacement.length - s2.length)
+  var separator = '\n\n'.substring(0, nls)

-function join (string1, string2) {
-  var separator = separatingNewlines(string1, string2)
-
-  // Remove trailing/leading newlines and replace with separator
-  string1 = string1.replace(trailingNewLinesRegExp, '')
-  string2 = string2.replace(leadingNewLinesRegExp, '')
-
-  return string1 + separator + string2
+  return s1 + separator + s2
 }

 /**
--- a/packages/turndown/src/utilities.js
+++ b/packages/turndown/src/utilities.js
@ -14,31 +14,67 @@ export function repeat (character, count) {
  return Array(count + 1).join(character)
 }

+export function trimLeadingNewlines (string) {
+  return string.replace(/^\n*/, '')
+}
+
+export function trimTrailingNewlines (string) {
+  // avoid match-at-end regexp bottleneck, see #370
+  var indexEnd = string.length
+  while (indexEnd > 0 && string[indexEnd - 1] === '\n') indexEnd--
+  return string.substring(0, indexEnd)
+}
+
 export var blockElements = [
-  'address', 'article', 'aside', 'audio', 'blockquote', 'body', 'canvas',
-  'center', 'dd', 'dir', 'div', 'dl', 'dt', 'fieldset', 'figcaption',
-  'figure', 'footer', 'form', 'frameset', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-  'header', 'hgroup', 'hr', 'html', 'isindex', 'li', 'main', 'menu', 'nav',
-  'noframes', 'noscript', 'ol', 'output', 'p', 'pre', 'section', 'table',
-  'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'ul'
+  'ADDRESS', 'ARTICLE', 'ASIDE', 'AUDIO', 'BLOCKQUOTE', 'BODY', 'CANVAS',
+  'CENTER', 'DD', 'DIR', 'DIV', 'DL', 'DT', 'FIELDSET', 'FIGCAPTION', 'FIGURE',
+  'FOOTER', 'FORM', 'FRAMESET', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HEADER',
+  'HGROUP', 'HR', 'HTML', 'ISINDEX', 'LI', 'MAIN', 'MENU', 'NAV', 'NOFRAMES',
+  'NOSCRIPT', 'OL', 'OUTPUT', 'P', 'PRE', 'SECTION', 'TABLE', 'TBODY', 'TD',
+  'TFOOT', 'TH', 'THEAD', 'TR', 'UL'
 ]

 export function isBlock (node) {
-  return blockElements.indexOf(node.nodeName.toLowerCase()) !== -1
+  return is(node, blockElements)
 }

 export var voidElements = [
-  'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
-  'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
+  'AREA', 'BASE', 'BR', 'COL', 'COMMAND', 'EMBED', 'HR', 'IMG', 'INPUT',
+  'KEYGEN', 'LINK', 'META', 'PARAM', 'SOURCE', 'TRACK', 'WBR'
 ]

 export function isVoid (node) {
-  return voidElements.indexOf(node.nodeName.toLowerCase()) !== -1
+  return is(node, voidElements)
 }

-var voidSelector = voidElements.join()
 export function hasVoid (node) {
-  return node.querySelector && node.querySelector(voidSelector)
+  return has(node, voidElements)
+}
+
+var meaningfulWhenBlankElements = [
+  'A', 'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TH', 'TD', 'IFRAME', 'SCRIPT',
+  'AUDIO', 'VIDEO'
+]
+
+export function isMeaningfulWhenBlank (node) {
+  return is(node, meaningfulWhenBlankElements)
+}
+
+export function hasMeaningfulWhenBlank (node) {
+  return has(node, meaningfulWhenBlankElements)
+}
+
+function is (node, tagNames) {
+  return tagNames.indexOf(node.nodeName) >= 0
+}
+
+function has (node, tagNames) {
+  return (
+    node.getElementsByTagName &&
+    tagNames.some(function (tagName) {
+      return node.getElementsByTagName(tagName).length
+    })
+  )
 }

 // To handle code that is presented as below (see https://github.com/laurent22/joplin/issues/573)