import { repeat, isCodeBlockSpecialCase1, isCodeBlockSpecialCase2, isCodeBlock, getStyleProp } from './utilities'
const Entities = require('html-entities').AllHtmlEntities;
const htmlentities = (new Entities()).encode;
/**
 * Serialises attribute objects into a space-separated `name="value"` string.
 *
 * @param {Iterable<{name: string, value: string}>} attributes - Attributes to
 *   serialise; a falsy value yields an empty string.
 * @param {{skipEmptyClass?: boolean}|null} [options] - When `skipEmptyClass`
 *   is true, a `class` attribute with a falsy value is omitted.
 * @returns {string} The serialised attributes, values passed through `htmlentities`.
 */
function attributesHtml(attributes, options = null) {
  if (!attributes) return '';

  const settings = Object.assign({ skipEmptyClass: false }, options);
  const parts = [];

  for (const attribute of attributes) {
    const isEmptyClass = attribute.name === 'class' && !attribute.value;
    if (isEmptyClass && settings.skipEmptyClass) continue;
    parts.push(`${attribute.name}="${htmlentities(attribute.value)}"`);
  }

  return parts.join(' ');
}
var rules = {}

rules.paragraph = {
  filter: 'p',

  /**
   * Converts a paragraph to Markdown.
   *
   * @param {string} content - Already-converted content of the paragraph.
   * @returns {string} The content surrounded by blank lines, or '' when the
   *   paragraph is truly empty.
   */
  replacement: function (content) {
    // If the line starts with a nonbreaking space (\u00A0), HTML-escape it.
    // By default the markdown renderer removes leading nonbreaking spaces
    // that are not HTML-escaped, but because the space is nonbreaking we
    // want to keep it.
    const leadingNonbreakingSpace = /^\u{00A0}/u;
    content = content.replace(leadingNonbreakingSpace, '&nbsp;');

    // Paragraphs that are truly empty (not even containing nonbreaking
    // spaces) take up no space by default. Output nothing.
    if (content === '') {
      return '';
    }

    return '\n\n' + content + '\n\n';
  }
}
rules.lineBreak = {
  filter: 'br',

  /**
   * Converts a line break (BR element) to Markdown.
   *
   * @param {string} _content - Unused.
   * @param {object} node - The BR node; `node.isCode` is read to detect code.
   * @param {object} options - `options.br` is the Markdown line-break marker.
   * @param {object} [previousNode] - Presumably the node processed just
   *   before this one (confirm against the caller); only its `nodeName` is read.
   * @returns {string} The text that replaces the line break.
   */
  replacement: function (_content, node, options, previousNode) {
    // Code blocks may include BR elements. Replacing them with a Markdown
    // break marker should not be necessary there: a plain newline is enough.
    if (node.isCode) {
      return '\n';
    }

    // A line break that directly follows another one is kept as HTML.
    // Emitting plain whitespace here would silently drop the second break
    // from the output.
    if (previousNode && previousNode.nodeName === 'BR') {
      return '<br/>';
    }

    return options.br + '\n';
  }
}
rules.heading = {
  filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],

  // Renders H1-H6. With `headingStyle === 'setext'`, levels 1 and 2 are
  // underlined with '=' / '-' (one character per content character); every
  // other case uses the ATX form ('#' repeated `level` times).
  replacement: function (content, node, options) {
    const level = Number(node.nodeName.charAt(1));
    const useSetext = options.headingStyle === 'setext' && level < 3;

    if (!useSetext) {
      return '\n\n' + repeat('#', level) + ' ' + content + '\n\n';
    }

    const underlineChar = level === 1 ? '=' : '-';
    const underline = repeat(underlineChar, content.length);
    return '\n\n' + content + '\n' + underline + '\n\n';
  }
}
// ==============================
// Joplin format support
// ==============================
rules.highlight = {
  filter: 'mark',

  // Wraps the content of a MARK element in '==' delimiters.
  replacement: function (content, node, options) {
    const delimiter = '==';
    return delimiter + content + delimiter;
  }
}
// Unlike strikethrough and mark formatting, insert, sup and sub aren't
// widespread enough to automatically convert them to Markdown, but keep them as
// HTML anyway. Another issue is that we use "~" for subscript but that's
// actually the syntax for strikethrough on GitHub, so it's best to keep it as
// HTML to avoid any ambiguity.
rules.insert = {
  /**
   * Matches inserted/underlined text.
   *
   * TinyMCE represents this either with an INS tag (when pressing the
   * toolbar button) or using the style "text-decoration" (when using the
   * shortcut Cmd+U).
   *
   * https://github.com/laurent22/joplin/issues/5480
   */
  filter: function (node, options) {
    if (node.nodeName === 'INS') return true;
    return getStyleProp(node, 'text-decoration') === 'underline';
  },

  /**
   * Keeps the insertion as HTML, since there is no widespread Markdown
   * syntax for it.
   *
   * @param {string} content - Already-converted inner content.
   * @returns {string} The content wrapped in an INS element.
   */
  replacement: function (content, node, options) {
    return '<ins>' + content + '</ins>';
  }
}
rules.superscript = {
  filter: 'sup',

  /**
   * Keeps superscript as HTML, since its Markdown syntax is not widespread.
   *
   * @param {string} content - Already-converted inner content.
   * @returns {string} The content wrapped in a SUP element.
   */
  replacement: function (content, node, options) {
    return '<sup>' + content + '</sup>';
  }
}
rules.subscript = {
  filter: 'sub',

  /**
   * Keeps subscript as HTML. A "~" based Markdown syntax would be ambiguous
   * with strikethrough on GitHub.
   *
   * @param {string} content - Already-converted inner content.
   * @returns {string} The content wrapped in a SUB element.
   */
  replacement: function (content, node, options) {
    return '<sub>' + content + '</sub>';
  }
}
// Handles foreground color changes as created by the rich text editor.
// We intentionally don't handle the general style="color: colorhere" case as
// this may leave unwanted formatting when saving websites as markdown.
rules.foregroundColor = {
  /**
   * Matches SPAN elements that carry a `color` style, but only when
   * `options.preserveColorStyles` is enabled.
   */
  filter: function (node, options) {
    return options.preserveColorStyles && node.nodeName === 'SPAN' && getStyleProp(node, 'color');
  },

  /**
   * Keeps the coloured text as an HTML span so the colour survives the
   * conversion. The colour value is entity-encoded because it is placed
   * inside a double-quoted attribute.
   *
   * @param {string} content - Already-converted inner content.
   * @param {object} node - The matched SPAN node.
   * @returns {string} A span with only the `color` style preserved.
   */
  replacement: function (content, node, options) {
    const color = htmlentities(getStyleProp(node, 'color'));
    return `<span style="color: ${color};">${content}</span>`;
  },
}
// Converts placeholders for not-loaded resources.
rules.resourcePlaceholder = {
  /**
   * Matches placeholder nodes for image resources that were not loaded.
   * Requires `options.allowResourcePlaceholders`, the classes
   * `not-loaded-resource` and `not-loaded-image-resource`, and a
   * `data-resource-id` made of exactly 32 characters in [a-z0-9].
   */
  filter: function (node, options) {
    if (!options.allowResourcePlaceholders) return false;
    if (!node.classList || !node.classList.contains('not-loaded-resource')) return false;

    const isImage = node.classList.contains('not-loaded-image-resource');
    if (!isImage) return false;

    const resourceId = node.getAttribute('data-resource-id');
    return resourceId && resourceId.match(/^[a-z0-9]{32}$/);
  },

  /**
   * Rebuilds the image the placeholder stands for.
   *
   * When the placeholder carries `data-original-before` and/or
   * `data-original-after`, the result is an HTML img tag whose attributes
   * are those two fragments with the `src` in between. Otherwise a Markdown
   * image is built from `data-original-alt` and `data-original-title`.
   *
   * @param {string} _content - Unused.
   * @param {object} node - The placeholder node.
   * @returns {string} HTML or Markdown for the image.
   */
  replacement: function (_content, node) {
    const htmlBefore = node.getAttribute('data-original-before') || '';
    const htmlAfter = node.getAttribute('data-original-after') || '';
    const isHtml = htmlBefore || htmlAfter;
    const resourceId = node.getAttribute('data-resource-id');

    if (isHtml) {
      // Empty fragments are dropped so that no stray spaces are emitted.
      const attrs = [
        htmlBefore.trim(),
        `src=":/${resourceId}"`,
        htmlAfter.trim(),
      ].filter(a => !!a);

      return `<img ${attrs.join(' ')}>`;
    }

    const originalAltText = node.getAttribute('data-original-alt') || '';
    const title = node.getAttribute('data-original-title');
    return imageMarkdownFromAttributes({
      alt: originalAltText,
      title,
      src: `:/${resourceId}`,
    });
  }
}
// ==============================
// END Joplin format support
// ==============================
rules.blockquote = {
  filter: 'blockquote',

  // Strips leading/trailing newlines, prefixes every line with '> ' and
  // surrounds the result with blank lines.
  replacement: function (content) {
    const trimmed = content.replace(/^\n+|\n+$/g, '');
    const quoted = trimmed.replace(/^/gm, '> ');
    return ['\n\n', quoted, '\n\n'].join('');
  }
}
rules.list = {
  filter: ['ul', 'ol'],

  /**
   * Converts a UL/OL element.
   *
   * @param {string} content - Already-converted list items.
   * @param {object} node - The list node.
   * @returns {string} '' for line-numbering lists inside code blocks,
   *   otherwise the content with surrounding newlines.
   */
  replacement: function (content, node) {
    const parent = node.parentNode;

    // Ignore code-block children with class pre-numbering.
    // See https://github.com/laurent22/joplin/pull/10126#discussion_r1532204251 .
    // test case: packages/app-cli/tests/html_to_md/code_multiline_2.html
    const isLineNumbering =
      parent && isCodeBlock(parent) && node.classList && node.classList.contains('pre-numbering');
    if (isLineNumbering) {
      return '';
    }

    // A list that is the last element of a list item only needs a single
    // leading newline. `parent` is guarded here as well, so a detached node
    // falls through to the default instead of throwing.
    const isTrailingNestedList =
      parent && parent.nodeName === 'LI' && parent.lastElementChild === node;
    if (isTrailingNestedList) {
      return '\n' + content;
    }

    return '\n\n' + content + '\n\n';
  }
}
// OL elements are ordered lists, but other elements with a "list-style-type: decimal" style
// should also be considered ordered lists, at least that's how they are rendered
// in browsers.
// https://developer.mozilla.org/en-US/docs/Web/CSS/list-style-type
/**
 * Tells whether an element should be treated as an ordered list: either an
 * OL element, or any element whose inline style sets
 * `list-style-type: decimal`.
 *
 * @param {object} e - Element exposing `nodeName` and, optionally, `style`.
 * @returns {boolean|undefined} `true` for OL; otherwise the result of the
 *   style check (falsy when the element has no `style`).
 */
function isOrderedList(e) {
  return e.nodeName === 'OL'
    ? true
    : e.style && e.style.listStyleType === 'decimal';
}
rules.listItem = {
  filter: 'li',

  /**
   * Converts a list item, choosing between a bullet, a Joplin checkbox and
   * an ordered-list number as prefix.
   *
   * @param {string} content - Already-converted content of the item.
   * @param {object} node - The LI node; `node.isCode` marks items in code.
   * @param {object} options - `options.bulletListMarker` is the bullet char.
   * @returns {string} The Markdown for the item.
   */
  replacement: function (content, node, options) {
    content = content
      .replace(/^\n+/, '') // remove leading newlines
      .replace(/\n+$/, '\n'); // replace trailing newlines with just a single one

    if (node.isCode === false) {
      // Indent continuation lines by four columns, i.e. the width of an
      // ordered-list marker such as "9.  " or "10. ", so that they line up
      // with the content of the first line.
      content = content.replace(/\n/gm, '\n    ');
    }

    let prefix = options.bulletListMarker + ' ';

    const joplinCheckbox = joplinCheckboxInfo(node);
    if (joplinCheckbox) {
      prefix = '- [' + (joplinCheckbox.checked ? 'x' : ' ') + '] ';
    } else {
      const parent = node.parentNode;
      if (isOrderedList(parent)) {
        if (node.isCode) {
          // Ordered lists in code blocks are often for line numbers. Remove them.
          // See https://github.com/laurent22/joplin/pull/10126
          // test case: packages/app-cli/tests/html_to_md/code_multiline_4.html
          prefix = '';
        } else {
          const start = parent.getAttribute('start');
          const index = Array.prototype.indexOf.call(parent.children, node);
          const indexStr = String(start ? Number(start) + index : index + 1);

          // The content of the line that contains the number must align
          // with the following lines, i.e. it should be:
          //
          //     9.  my content
          //         second line
          //     10. next one
          //         second line
          //
          // The marker is therefore padded to four columns. At least one
          // space is always kept: without it a number of three digits would
          // be glued to the content, and one of four or more digits would
          // make `repeat()` throw a RangeError on a negative count.
          const padding = ' '.repeat(Math.max(1, 3 - indexStr.length));
          prefix = indexStr + '.' + padding;
        }
      }
    }

    const needsTrailingNewline = node.nextSibling && !/\n$/.test(content);
    return prefix + content + (needsTrailingNewline ? '\n' : '');
  }
}
rules.indentedCodeBlock = {
  // Matches code blocks when the indented style is selected.
  filter: function (node, options) {
    if (options.codeBlockStyle !== 'indented') return false;
    return isCodeBlock(node);
  },

  /**
   * Renders a code block in the indented style.
   *
   * The text is taken from the node itself for "special case 1" code
   * blocks, and from its first child otherwise.
   *
   * @param {string} content - Unused; the raw `textContent` is used instead.
   * @param {object} node - The code block node.
   * @returns {string} The code with every line indented, between blank lines.
   */
  replacement: function (content, node, options) {
    const handledNode = isCodeBlockSpecialCase1(node) ? node : node.firstChild;

    // An indented code block needs four spaces of indentation on every
    // line; anything less is rendered as a regular paragraph.
    const indent = '    ';

    return (
      '\n\n' + indent +
      handledNode.textContent.replace(/\n/g, '\n' + indent) +
      '\n\n'
    );
  }
}
rules.fencedCodeBlock = {
  // Matches code blocks when the fenced style is selected.
  filter: function (node, options) {
    return options.codeBlockStyle === 'fenced' ? isCodeBlock(node) : false;
  },

  // Renders a fenced code block. The language comes from a `language-xxx`
  // class on the handled node, and the fence is made longer than any run of
  // fence characters that starts a line of the code.
  replacement: function (content, node, options) {
    const isSpecialCase = isCodeBlockSpecialCase1(node) || isCodeBlockSpecialCase2(node);
    const handledNode = isSpecialCase ? node : node.firstChild;

    const languageMatch = (handledNode.className || '').match(/language-(\S+)/);
    const language = languageMatch ? languageMatch[1] : '';

    const fenceChar = options.fence.charAt(0);
    const fenceRuns = content.match(new RegExp('^' + fenceChar + '{3,}', 'gm')) || [];
    const fenceSize = fenceRuns.reduce(
      (size, run) => Math.max(size, run.length + 1),
      3
    );
    const fence = repeat(fenceChar, fenceSize);

    // Remove leading empty lines and all trailing whitespace from the code.
    const code = content.replace(/^([ \t]*\n)+/, '').trimEnd();

    return [
      '\n\n', fence, language, '\n',
      code,
      '\n', fence, '\n\n',
    ].join('');
  }
}
rules.horizontalRule = {
  filter: 'hr',

  // Emits the configured rule marker (`options.hr`) between blank lines.
  replacement: function (content, node, options) {
    const blankLine = '\n\n';
    return blankLine + options.hr + blankLine;
  }
}
/**
 * Normalises link text: trims it and turns every run of line breaks
 * (\n and/or \r) into a single space.
 *
 * @param {string} content - Link text.
 * @returns {string} The single-line link text.
 */
function filterLinkContent (content) {
  const trimmed = content.trim();
  return trimmed.split(/[\n\r]+/).join(' ');
}
/**
 * Sanitises a link target for use inside Markdown `(...)`.
 *
 * - `javascript:` URLs are dropped, so no JS code is kept in the Markdown.
 * - Spaces, newlines and tabs are percent-encoded because they can break
 *   some renderers.
 * - Parentheses are percent-encoded as well.
 *
 * @param {string|null|undefined} href - Raw link target.
 * @returns {string} The sanitised target, or '' when there is none.
 */
function filterLinkHref (href) {
  if (!href) return '';

  const trimmed = href.trim();
  if (trimmed.toLowerCase().startsWith('javascript:')) return '';

  const escapes = {
    ' ': '%20',
    '\n': '%0A',
    '\t': '%09',
    '(': '%28',
    ')': '%29',
  };
  return trimmed.replace(/[ \n\t()]/g, (character) => escapes[character]);
}
/**
 * Prepares an image title for use inside the double-quoted title part of a
 * Markdown image, i.e. `![alt](src "title")`.
 *
 * Double quotes and parentheses are replaced by HTML entities: left as-is,
 * a quote would end the title early and a parenthesis could end the link
 * destination.
 *
 * @param {string|null|undefined} title - Raw title.
 * @returns {string} The trimmed, escaped title, or '' when there is none.
 */
function filterImageTitle(title) {
  if (!title) return '';

  const entities = {
    '"': '&quot;',
    '(': '&#40;',
    ')': '&#41;',
  };
  return title.trim().replace(/["()]/g, (character) => entities[character]);
}
/**
 * Builds the HTML anchor that keeps a node usable as an in-document link
 * target.
 *
 * The anchor name is the node's `id`, or its `name` when there is no `id`.
 * An anchor is only emitted when the lowercased name is listed in
 * `options.anchorNames`.
 *
 * @param {object} node - Node exposing `getAttribute`.
 * @param {object} options - `options.anchorNames` lists the accepted names.
 * @returns {string} An empty A element with the `id`, or '' when no anchor
 *   is needed.
 */
function getNamedAnchorFromLink(node, options) {
  let id = node.getAttribute('id');
  if (!id) id = node.getAttribute('name');
  if (id) id = id.trim();

  if (id && options.anchorNames.indexOf(id.toLowerCase()) >= 0) {
    // The name goes inside a double-quoted attribute, so encode it.
    return '<a id="' + htmlentities(id) + '"></a>';
  }

  return '';
}
/**
 * Tells whether a URL starts with one of the schemes `http://`, `https://`
 * or `file://`.
 *
 * @param {string} url - URL to check.
 * @returns {boolean}
 */
function isLinkifiedUrl(url) {
  const linkifiedSchemes = ['http://', 'https://', 'file://'];
  return linkifiedSchemes.some((scheme) => url.startsWith(scheme));
}
rules.inlineLink = {
  // Matches A elements that have an href, a name or an id, when the inlined
  // link style is selected.
  filter: function (node, options) {
    if (options.linkStyle !== 'inlined') return false;
    if (node.nodeName !== 'A') return false;
    return node.getAttribute('href') || node.getAttribute('name') || node.getAttribute('id');
  },

  // Disable escaping content (including '_'s) when the link text is the same
  // as its href. This prevents links from being broken by added escapes.
  escapeContent: function (node, _options) {
    const href = node.getAttribute('href');
    return href !== node.textContent;
  },

  // Produces `[text](href "title")`, preceded by a named anchor when
  // `getNamedAnchorFromLink` returns one. Without a usable href only the
  // anchor and the text are kept.
  replacement: function (content, node, options) {
    const href = filterLinkHref(node.getAttribute('href'));
    const anchor = getNamedAnchorFromLink(node, options);
    const text = filterLinkContent(content);

    if (!href) return anchor + text;

    const hasTitle = node.title && node.title !== href;
    const title = hasTitle ? ' "' + node.title + '"' : '';
    const output = anchor + '[' + text + '](' + href + title + ')';

    // If the URL is automatically linkified by Joplin, and the text is the
    // same as the URL, there is no need to make it a link here. That
    // prevents URLs from the rich text editor from being needlessly
    // converted from this:
    //
    // https://example.com
    //
    // to this:
    //
    // [https://example.com](https://example.com)
    //
    // It means cleaner Markdown will also be generated by the web clipper.
    const isBareUrl = output === '[' + href + '](' + href + ')';
    if (isLinkifiedUrl(href) && isBareUrl) return href;

    return output;
  }
}
// Normally a named anchor is an <a> element carrying a `name` or `id`
// attribute, but other elements with an `id` (for example a <span>) can also
// be link targets, so the rule below handles those.
// Fixes https://github.com/laurent22/joplin/issues/1876
rules.otherNamedAnchors = {
  // Matches any node for which `getNamedAnchorFromLink` yields an anchor.
  filter: function (node, options) {
    const anchor = getNamedAnchorFromLink(node, options);
    return Boolean(anchor);
  },

  // Puts the anchor in front of the node's converted content.
  replacement: function (content, node, options) {
    const anchor = getNamedAnchorFromLink(node, options);
    return anchor + content;
  }
}
rules.referenceLink = {
  // Matches A elements with an href when the referenced link style is
  // selected.
  filter: function (node, options) {
    if (options.linkStyle !== 'referenced') return false;
    if (node.nodeName !== 'A') return false;
    return node.getAttribute('href');
  },

  // Emits the in-text part of a reference link and queues its definition in
  // `this.references`. `options.linkReferenceStyle` selects the collapsed
  // (`[text][]`), shortcut (`[text]`) or numbered (`[text][n]`) form.
  replacement: function (content, node, options) {
    const href = filterLinkHref(node.getAttribute('href'));
    const title = href && node.title ? ' "' + node.title + '"' : '';
    const text = filterLinkContent(content);
    const style = options.linkReferenceStyle;

    let replacement;
    let label;
    if (style === 'collapsed') {
      replacement = '[' + text + '][]';
      label = text;
    } else if (style === 'shortcut') {
      replacement = '[' + text + ']';
      label = text;
    } else {
      label = this.references.length + 1;
      replacement = '[' + text + '][' + label + ']';
    }

    this.references.push('[' + label + ']: ' + href + title);
    return replacement;
  },

  references: [],

  // Returns the queued reference definitions as one block and clears the
  // queue; returns '' when nothing is queued.
  append: function (options) {
    if (!this.references.length) return '';

    const block = '\n\n' + this.references.join('\n') + '\n\n';
    this.references = []; // Reset references
    return block;
  }
}
rules.emphasis = {
  filter: ['em', 'i'],

  // Blank content is dropped; content inside code is passed through as-is;
  // anything else is wrapped in `options.emDelimiter`.
  replacement: function (content, node, options) {
    const isBlank = !content.trim();
    if (isBlank) return '';

    const delimiter = options.emDelimiter;
    return node.isCode ? content : delimiter + content + delimiter;
  }
}
rules.strong = {
  filter: ['strong', 'b'],

  // Blank content is dropped; content inside code is passed through as-is;
  // anything else is wrapped in `options.strongDelimiter`.
  replacement: function (content, node, options) {
    const isBlank = !content.trim();
    if (isBlank) return '';

    const delimiter = options.strongDelimiter;
    return node.isCode ? content : delimiter + content + delimiter;
  }
}
rules.code = {
  /**
   * Matches CODE elements, except a CODE that is the only child of a PRE
   * (that combination is a code block, not inline code).
   */
  filter: function (node) {
    const hasSiblings = node.previousSibling || node.nextSibling;
    const isSoleChildOfPre = node.parentNode.nodeName === 'PRE' && !hasSiblings;
    return node.nodeName === 'CODE' && !isSoleChildOfPre;
  },

  /**
   * Renders inline code, choosing a backtick delimiter that does not occur
   * in the content.
   *
   * @param {string} content - Text of the CODE element.
   * @param {object} node - The CODE node.
   * @returns {string} The inline code span, the raw content for multiline
   *   code nested in a code block, or '' for empty content.
   */
  replacement: function (content, node, options) {
    if (!content) {
      return '';
    }

    content = content.replace(/\r?\n|\r/g, '\n');

    // If code is multiline and in a code block, just return it; the code
    // block rule will add the fence (default is ```).
    //
    // This handles the case where a CODE element is nested directly within
    // a code block element and should not be turned into an inline code
    // region.
    //
    // See https://github.com/laurent22/joplin/pull/10126 .
    if (content.indexOf('\n') !== -1 && node.parentNode && isCodeBlock(node.parentNode)) {
      return content;
    }

    // Inline code must stay on one line (line breaks were normalised to
    // '\n' above).
    content = content.replace(/\n/g, '');

    // Pad with a space when the content starts or ends with a backtick, or
    // is itself space-padded around non-space text.
    const extraSpace = /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : '';

    // Grow the delimiter until it differs from every backtick run inside
    // the content.
    let delimiter = '`';
    const matches = content.match(/`+/gm) || [];
    while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`';

    return delimiter + extraSpace + content + extraSpace + delimiter;
  }
}
/**
 * Builds a Markdown image, `![alt](src "title")`, from plain attributes.
 *
 * @param {{alt?: string, src?: string, title?: string}} attributes
 * @returns {string} The Markdown image, or '' when `filterLinkHref` leaves
 *   no usable `src`. Square brackets in `alt` are backslash-escaped.
 */
function imageMarkdownFromAttributes(attributes) {
  const src = filterLinkHref(attributes.src || '');
  const title = attributes.title || '';
  const titlePart = title ? ' "' + filterImageTitle(title) + '"' : '';

  if (!src) return '';

  const escapedAlt = (attributes.alt || '').replace(/([[\]])/g, '\\$1');
  return '![' + escapedAlt + '](' + src + titlePart + ')';
}
/**
 * Converts an image node to Markdown, or keeps it as HTML when its size has
 * to be preserved.
 *
 * @param {object} node - The image node.
 * @param {{preserveImageTagsWithSize?: boolean}|null} [options] - When
 *   `preserveImageTagsWithSize` is true and the node has a `width` or
 *   `height` attribute, the node's `outerHTML` is returned instead of
 *   Markdown.
 * @returns {string} Markdown or HTML for the image.
 */
function imageMarkdownFromNode(node, options = null) {
  const settings = Object.assign({}, {
    preserveImageTagsWithSize: false,
  }, options);

  const keepAsHtml = settings.preserveImageTagsWithSize &&
    (node.getAttribute('width') || node.getAttribute('height'));

  if (!keepAsHtml) {
    return imageMarkdownFromAttributes({
      alt: node.alt,
      src: node.getAttribute('src'),
      title: node.title,
    });
  }

  const html = node.outerHTML;

  // To prevent markup immediately after the image from being interpreted as
  // HTML, a closing tag is sometimes necessary.
  const needsClosingTag = () => {
    const parent = node.parentElement;
    if (!parent || parent.nodeName !== 'LI') return false;

    // Nothing to add when the HTML already ends with a closing tag.
    if (/<\/[a-z]+>$/i.test(html)) return false;

    // Even if surrounded by #text nodes that only contain whitespace,
    // Markdown after an image tag can still be incorrectly interpreted as
    // HTML. Only non-empty #text nodes seem to prevent this.
    const nonEmptyChildren = [...parent.childNodes].filter((item) => {
      return item.nodeName !== '#text' || item.textContent.trim() !== '';
    });

    // Only an image that comes first in the list item is affected.
    if (nonEmptyChildren.indexOf(node) !== 0) return false;

    const nextSibling = nonEmptyChildren[1];
    const nextSiblingName = nextSibling ? nextSibling.nodeName : null;
    return nextSiblingName === 'UL' || nextSiblingName === 'OL' || nextSiblingName === 'BR';
  };

  if (needsClosingTag()) {
    // Replace the final ">" or "/>" with ">" followed by a closing tag.
    return html.replace(/[/]?>$/, `></${node.nodeName.toLowerCase()}>`);
  }

  return html;
}
/**
 * Extracts the first URL from a node's `srcset` attribute, falling back to
 * `data-srcset`.
 *
 * Format of srcset can be:
 *   srcset="kitten.png"
 * or:
 *   srcset="kitten.png, kitten@2X.png 2x"
 *
 * @param {object} node - Node exposing `getAttribute`.
 * @returns {string} The text of the first comma-separated candidate up to
 *   its first space, or '' when neither attribute is set.
 */
function imageUrlFromSource(node) {
  const srcset = node.getAttribute('srcset') || node.getAttribute('data-srcset');
  if (!srcset) return '';

  const [firstCandidate] = srcset.split(',');
  const [url] = firstCandidate.split(' ');
  return url;
}
rules.image = {
  filter: 'img',

  // Delegates to `imageMarkdownFromNode`; the converted content is unused.
  replacement: function (_content, node, options) {
    const markdown = imageMarkdownFromNode(node, options);
    return markdown;
  }
}
rules.picture = {
filter: 'picture',
replacement: function (content, node, options) {
if (!node.childNodes) return '';
let firstSource = null;
let firstImg = null;
for (let i = 0; i < node.childNodes.length; i++) {
const child = node.childNodes[i];
if (child.nodeName === 'SOURCE' && !firstSource) firstSource = child;
if (child.nodeName === 'IMG') firstImg = child;
}
if (firstImg && firstImg.getAttribute('src')) {
return imageMarkdownFromNode(firstImg, options);
} else if (firstSource) {
// A