From 590769b1ae1a8a41aa549c34f3bb864fa9900d8e Mon Sep 17 00:00:00 2001 From: Laurent Cozic Date: Tue, 26 Dec 2023 11:19:29 +0000 Subject: [PATCH] Desktop, Cli: Resolves #9596: Restore note links after importing an ENEX file --- .../tests/enex_to_md/linked_notes.enex | 49 ++++++++++++++ packages/lib/import-enex-md-gen.test.ts | 13 ++++ packages/lib/import-enex-md-gen.ts | 2 + packages/lib/import-enex.ts | 43 ++++++++++-- packages/lib/markdownUtils.test.ts | 1 - packages/lib/markdownUtils.ts | 2 +- packages/lib/package.json | 1 + packages/utils/html.test.ts | 60 +++++++++++++++++ packages/utils/html.ts | 40 ++++++++++- packages/utils/jest.config.js | 2 + packages/utils/jest.setup.js | 1 + packages/utils/markdown.test.ts | 55 +++++++++++++++ packages/utils/markdown.ts | 67 +++++++++++++++++++ packages/utils/package.json | 8 ++- packages/utils/types.ts | 7 +- readme/apps/import_export.md | 4 +- yarn.lock | 4 ++ 17 files changed, 345 insertions(+), 14 deletions(-) create mode 100644 packages/app-cli/tests/enex_to_md/linked_notes.enex create mode 100644 packages/utils/html.test.ts create mode 100644 packages/utils/jest.setup.js create mode 100644 packages/utils/markdown.test.ts create mode 100644 packages/utils/markdown.ts diff --git a/packages/app-cli/tests/enex_to_md/linked_notes.enex b/packages/app-cli/tests/enex_to_md/linked_notes.enex new file mode 100644 index 0000000000..71713f6664 --- /dev/null +++ b/packages/app-cli/tests/enex_to_md/linked_notes.enex @@ -0,0 +1,49 @@ + + + + + Note 1 + 20160730T164129Z + 20231224T151443Z + + + ]]> + + + + Note 2 + 20160730T111759Z + 20160730T111807Z + + +
Testing
]]> +
+
+ + Note 3 + 20160730T111759Z + 20160730T111807Z + + + ]]> + + + + Ambiguous note + 20160730T111759Z + 20160730T111807Z + + +
Testing
]]> +
+
+ + Ambiguous note + 20160730T111759Z + 20160730T111807Z + + +
Testing
]]> +
+
+
diff --git a/packages/lib/import-enex-md-gen.test.ts b/packages/lib/import-enex-md-gen.test.ts index c90a3f94cf..705ea085eb 100644 --- a/packages/lib/import-enex-md-gen.test.ts +++ b/packages/lib/import-enex-md-gen.test.ts @@ -214,4 +214,17 @@ describe('import-enex-md-gen', () => { expect(resource.title).toBe('08.06.2014 16:58:55'); }); + it('should resolve note links', async () => { + await importEnexFile('linked_notes.enex'); + const notes: NoteEntity[] = await Note.all(); + + const note1 = notes.find(n => n.title === 'Note 1'); + const note2 = notes.find(n => n.title === 'Note 2'); + const note3 = notes.find(n => n.title === 'Note 3'); + + expect(notes.length).toBe(5); + expect(note1.body).toBe(`[Note 2](:/${note2.id})[Note 3](:/${note3.id})`); + expect(note3.body).toBe('[Ambiguous note](evernote:///view/5223870/s49/9cd5e810-fa03-429a-8194-ab847f2f1ab2/c99d9e01-ca35-4c75-ba63-f0c0ef97787d/)'); + }); + }); diff --git a/packages/lib/import-enex-md-gen.ts b/packages/lib/import-enex-md-gen.ts index 5440e568e6..20779b4265 100644 --- a/packages/lib/import-enex-md-gen.ts +++ b/packages/lib/import-enex-md-gen.ts @@ -58,6 +58,7 @@ interface ParserState { spanAttributes: string[]; tags: ParserStateTag[]; currentCode?: string; + evernoteLinkTitles: Record; } @@ -607,6 +608,7 @@ function enexXmlToMdArray(stream: any, resources: ResourceEntity[], tasks: Extra anchorAttributes: [], spanAttributes: [], tags: [], + evernoteLinkTitles: {}, }; const options = {}; diff --git a/packages/lib/import-enex.ts b/packages/lib/import-enex.ts index 19110c9359..d40212db1b 100644 --- a/packages/lib/import-enex.ts +++ b/packages/lib/import-enex.ts @@ -10,6 +10,8 @@ import { NoteEntity, ResourceEntity } from './services/database/types'; import { enexXmlToMd } from './import-enex-md-gen'; import { MarkupToHtml } from '@joplin/renderer'; import { fileExtension, friendlySafeFilename, safeFileExtension } from './path-utils'; +import { extractUrls as extractUrlsFromHtml } from '@joplin/utils/html'; 
+import { extractUrls as extractUrlsFromMarkdown } from '@joplin/utils/markdown'; const moment = require('moment'); const { wrapError } = require('./errorUtils'); const { enexXmlToHtml } = require('./import-enex-html-gen.js'); @@ -435,6 +437,15 @@ export default async function importEnex(parentFolderId: string, filePath: strin processingNotes = true; stream.pause(); + // Set the note ID so that we can create a title-to-id map, which + // will be needed to recreate the note links below. + const noteTitleToId: Record = {}; + for (const note of notes) { + if (!noteTitleToId[note.title]) noteTitleToId[note.title] = []; + note.id = uuid.create(); + noteTitleToId[note.title].push(note.id); + } + while (notes.length) { const note = notes.shift(); @@ -452,20 +463,40 @@ export default async function importEnex(parentFolderId: string, filePath: strin note.resources[i] = resource; } - const body = importOptions.outputFormat === 'html' ? + // -------------------------------------------------------- + // Convert the ENEX body to either Markdown or HTML + // -------------------------------------------------------- + + let body: string = importOptions.outputFormat === 'html' ? await enexXmlToHtml(note.bodyXml, note.resources) : await enexXmlToMd(note.bodyXml, note.resources, note.tasks); delete note.bodyXml; + // -------------------------------------------------------- + // Convert the Evernote note links to Joplin note links. If + // we don't find a matching note, or if there are multiple + // matching notes, we leave the Evernote links as is. + // -------------------------------------------------------- + + const links = importOptions.outputFormat === 'html' ? 
+ extractUrlsFromHtml(body) : + extractUrlsFromMarkdown(body); + + for (const link of links) { + const matchingNoteIds = noteTitleToId[link.title]; + if (matchingNoteIds && matchingNoteIds.length === 1) { + body = body.replace(link.url, `:/${matchingNoteIds[0]}`); + } + } + + // -------------------------------------------------------- + // Finish setting up the note + // -------------------------------------------------------- + note.markup_language = importOptions.outputFormat === 'html' ? MarkupToHtml.MARKUP_LANGUAGE_HTML : MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN; - // console.info('*************************************************************************'); - // console.info(body); - // console.info('*************************************************************************'); - - note.id = uuid.create(); note.parent_id = parentFolderId; note.body = body; diff --git a/packages/lib/markdownUtils.test.ts b/packages/lib/markdownUtils.test.ts index a6e9223cd9..bf5545a8dd 100644 --- a/packages/lib/markdownUtils.test.ts +++ b/packages/lib/markdownUtils.test.ts @@ -90,5 +90,4 @@ describe('Should detect list items', () => { test('should NOT detect `+ [x]` as empty list item ', () => { expect(markdownUtils.isEmptyListItem('+ [x]')).toBe(false); }); - }); diff --git a/packages/lib/markdownUtils.ts b/packages/lib/markdownUtils.ts index fe8c8975d5..9cbcb624e5 100644 --- a/packages/lib/markdownUtils.ts +++ b/packages/lib/markdownUtils.ts @@ -1,7 +1,7 @@ import { validateLinks } from '@joplin/renderer'; const stringPadding = require('string-padding'); const urlUtils = require('./urlUtils'); -const MarkdownIt = require('markdown-it'); +import * as MarkdownIt from 'markdown-it'; // Taken from codemirror/addon/edit/continuelist.js const listRegex = /^(\s*)([*+-] \[[x ]\]\s|[*+-]\s|(\d+)([.)]\s))(\s*)/; diff --git a/packages/lib/package.json b/packages/lib/package.json index e9be04a11f..150ca5868e 100644 --- a/packages/lib/package.json +++ b/packages/lib/package.json @@ -19,6 +19,7 
@@ "@types/fs-extra": "11.0.4", "@types/jest": "29.5.8", "@types/js-yaml": "4.0.9", + "@types/markdown-it": "13.0.7", "@types/node": "18.18.14", "@types/node-rsa": "1.1.4", "@types/react": "18.2.41", diff --git a/packages/utils/html.test.ts b/packages/utils/html.test.ts new file mode 100644 index 0000000000..1a94e22977 --- /dev/null +++ b/packages/utils/html.test.ts @@ -0,0 +1,60 @@ +import { extractUrls } from './html'; +import { Link } from './types'; + +describe('htmlUtils', () => { + + test.each([ + [ + '', + [], + ], + [ + 'bla >Testing no link"', + [], + ], + [ + 'bla Testing link"', + [ + { + url: 'https://example.com', + title: 'Testing link', + }, + ], + ], + [ + 'Test 1 Test 2', + [ + { + url: '#', + title: 'Test 1', + }, + { + url: '', + title: 'Test 2', + }, + ], + ], + [ + '', + [ + { + url: 'https://example.com', + title: '', + }, + ], + ], + [ + 'check & encoding', + [ + { + url: '#', + title: 'check & encoding', + }, + ], + ], + ])('should retrieve links', (html: string, expected: Link[]) => { + const actual = extractUrls(html); + expect(actual).toEqual(expected); + }); + +}); diff --git a/packages/utils/html.ts b/packages/utils/html.ts index 946bebdec4..2343ec99ea 100644 --- a/packages/utils/html.ts +++ b/packages/utils/html.ts @@ -1,6 +1,7 @@ -/* eslint-disable import/prefer-default-export */ +import { Link } from './types'; const Entities = require('html-entities').AllHtmlEntities; +const htmlparser2 = require('@joplin/fork-htmlparser2'); const selfClosingElements = [ 'area', @@ -40,3 +41,40 @@ export const attributesHtml = (attr: Record) => { export const isSelfClosingTag = (tagName: string) => { return selfClosingElements.includes(tagName.toLowerCase()); }; + +export const extractUrls = (html: string) => { + if (!html || !html.trim()) return []; + + const output: Link[] = []; + let currentLink: Link|null = null; + + const parser = new htmlparser2.Parser({ + + onopentag: (name: string, attrs: Record) => { + if (name === 'a') { + currentLink = { 
+ url: attrs && attrs.href ? attrs.href : '', + title: '', + }; + } + }, + + ontext: (text: string) => { + if (currentLink) currentLink.title += text; + }, + + onclosetag: (name: string) => { + if (name === 'a') { + if (!currentLink) throw new Error('Found a closing anchor tag without an opening one'); + output.push(currentLink); + currentLink = null; + } + }, + + }, { decodeEntities: true }); + + parser.write(html); + parser.end(); + + return output; +}; diff --git a/packages/utils/jest.config.js b/packages/utils/jest.config.js index 829416779c..fce58ebe25 100644 --- a/packages/utils/jest.config.js +++ b/packages/utils/jest.config.js @@ -16,4 +16,6 @@ module.exports = { testPathIgnorePatterns: ['/node_modules/'], slowTestThreshold: 40, + + setupFilesAfterEnv: [`${__dirname}/jest.setup.js`], }; diff --git a/packages/utils/jest.setup.js b/packages/utils/jest.setup.js new file mode 100644 index 0000000000..870086a4dc --- /dev/null +++ b/packages/utils/jest.setup.js @@ -0,0 +1 @@ +require('../../jest.base-setup.js')(); diff --git a/packages/utils/markdown.test.ts b/packages/utils/markdown.test.ts new file mode 100644 index 0000000000..75026ce13e --- /dev/null +++ b/packages/utils/markdown.test.ts @@ -0,0 +1,55 @@ +import { extractUrls } from './markdown'; +import { Link } from './types'; + +describe('markdown', () => { + + test.each([ + [ + '', + [], + ], + [ + 'Some text and no links', + [], + ], + [ + '[](https://example.com)', + [ + { + url: 'https://example.com', + title: '', + }, + ], + ], + [ + 'before [testing](https://example.com) [testing **with bold**](https://example2.com) after', + [ + { + url: 'https://example.com', + title: 'testing', + }, + { + url: 'https://example2.com', + title: 'testing with bold', + }, + ], + ], + [ + '[Testing MD](https://example.com/md) Testing HTML', + [ + { + url: 'https://example.com/md', + title: 'Testing MD', + }, + { + url: 'https://example.com/html', + title: 'Testing HTML', + }, + ], + ], + ])('should extract URLs', (md: 
string, expected: Link[]) => { + const actual = extractUrls(md); + expect(actual).toEqual(expected); + }); + +}); diff --git a/packages/utils/markdown.ts b/packages/utils/markdown.ts new file mode 100644 index 0000000000..9f194794d1 --- /dev/null +++ b/packages/utils/markdown.ts @@ -0,0 +1,67 @@ +/* eslint-disable import/prefer-default-export */ + +import * as MarkdownIt from 'markdown-it'; +import { Link } from './types'; + +// enable file link URLs in MarkdownIt. Keeps other URL restrictions of +// MarkdownIt untouched. Format [link name](file://...) +const validateLinks = (url: string) => { + const BAD_PROTO_RE = /^(vbscript|javascript|data):/; + const GOOD_DATA_RE = /^data:image\/(gif|png|jpeg|webp);/; + + // url should be normalized at this point, and existing entities are decoded + const str = url.trim().toLowerCase(); + + if (str.indexOf('data:image/svg+xml,') === 0) { + return true; + } + + return BAD_PROTO_RE.test(str) ? (!!GOOD_DATA_RE.test(str)) : true; +}; + +// Note that the title is stripped of any Markdown code. So `title with +// **bold**` will become `title with bold`. Links are extracted both from +// Markdown and from HTML links. 
+export const extractUrls = (md: string): Link[] => { + const markdownIt = new MarkdownIt(); + markdownIt.validateLink = validateLinks; // Necessary to support file:/// links + + const env = {}; + const tokens = markdownIt.parse(md, env); + const output: Link[] = []; + + const searchUrls = (tokens: MarkdownIt.Token[], currentLink: Link|null) => { + for (let i = 0; i < tokens.length; i++) { + const token = tokens[i]; + if (token.type === 'link_open') { + currentLink = { + title: '', + url: token.attrGet('href') || '', + }; + } else if (token.type === 'link_close') { + if (!currentLink) throw new Error('Found a link_close without a link_open'); + output.push(currentLink); + currentLink = null; + } else if (token.children && token.children.length) { + searchUrls(token.children, currentLink); + } else if (token.type === 'text' && currentLink) { + currentLink.title += token.content; + } + } + }; + + searchUrls(tokens, null); + + // Definitely won't work in all cases but for our particular use case, + // processing Markdown generated from ENEX documents, that should be enough. 
+ const htmlAnchorRegex = /(.*?)<\/a>/ig; + let result; + while ((result = htmlAnchorRegex.exec(md)) !== null) { + output.push({ + url: result[1], + title: result[2], + }); + } + + return output; +}; diff --git a/packages/utils/package.json b/packages/utils/package.json index 2a5a21f32c..8bcd38a57c 100644 --- a/packages/utils/package.json +++ b/packages/utils/package.json @@ -8,11 +8,12 @@ "./env": "./dist/env.js", "./fs": "./dist/fs.js", "./html": "./dist/html.js", + "./Logger": "./dist/Logger.js", + "./markdown": "./dist/markdown.js", "./net": "./dist/net.js", "./time": "./dist/time.js", "./types": "./dist/types.js", - "./url": "./dist/url.js", - "./Logger": "./dist/Logger.js" + "./url": "./dist/url.js" }, "publishConfig": { "access": "public" @@ -27,11 +28,13 @@ "author": "", "license": "AGPL-3.0-or-later", "dependencies": { + "@joplin/fork-htmlparser2": "^4.1.50", "async-mutex": "0.4.0", "execa": "5.1.1", "fs-extra": "11.1.1", "glob": "10.3.10", "html-entities": "1.4.0", + "markdown-it": "13.0.2", "moment": "2.29.4", "node-fetch": "2.6.7", "sprintf-js": "1.1.3" @@ -39,6 +42,7 @@ "devDependencies": { "@types/fs-extra": "11.0.4", "@types/jest": "29.5.8", + "@types/markdown-it": "13.0.7", "@types/node-fetch": "2.6.9", "jest": "29.7.0", "ts-jest": "29.1.1" diff --git a/packages/utils/types.ts b/packages/utils/types.ts index 61a01d5eb4..645f1a6b28 100644 --- a/packages/utils/types.ts +++ b/packages/utils/types.ts @@ -1,6 +1,9 @@ -/* eslint-disable import/prefer-default-export */ - export interface Size { width?: number; height?: number; } + +export interface Link { + title: string; + url: string; +} diff --git a/readme/apps/import_export.md b/readme/apps/import_export.md index e90963f182..0bbb367a63 100644 --- a/readme/apps/import_export.md +++ b/readme/apps/import_export.md @@ -6,10 +6,12 @@ Joplin was designed as a replacement for Evernote and so can import complete Evernote notebooks, as well as notes, tags, resources (attached files) and note metadata (such as 
author, geo-location, etc.) via ENEX files. In terms of data, the only two things that might slightly differ are: -- Recognition data - Evernote images, in particular scanned (or photographed) documents have [recognition data](https://en.wikipedia.org/wiki/Optical_character_recognition) associated with them. It is the text that Evernote has been able to recognise in the document. This data is not preserved when the note are imported into Joplin. However, should it become supported in the search tool or other parts of Joplin, it should be possible to regenerate this recognition data since the actual image would still be available. +- Recognition data - Evernote images, in particular scanned (or photographed) documents have [recognition data](https://en.wikipedia.org/wiki/Optical_character_recognition) associated with them. It is the text that Evernote has been able to recognise in the document. This data is not preserved when the notes are imported into Joplin. However, if you have enabled OCR in Joplin, that recognition data will be recreated in a format compatible with Joplin. - Colour, font sizes and faces - Evernote text is stored as HTML and this is converted to Markdown during the import process. For notes that are mostly plain text or with basic formatting (bold, italic, bullet points, links, etc.) this is a lossless conversion, and the note, once rendered back to HTML should be very similar. Tables are also imported and converted to Markdown tables. For very complex notes, some formatting data might be lost - in particular colours, font sizes and font faces will not be imported. The text itself however is always imported in full regardless of formatting. If it is essential that this extra data is preserved then Joplin also allows import of ENEX files as HTML. +- Links between notes are mostly preserved.
However, the ENEX format does not include all the necessary information to find out what the target of a link is (specifically, Evernote uses an ID for the link but that ID is not associated with the target note). Instead, Joplin tries to guess which note is linked based on the note title, which mostly works, but not always - for example if multiple notes have the same title, or if the link title is different from the target note title. If Joplin cannot guess how to restore the link, the Evernote link will remain. + To import Evernote data, first export your Evernote notebooks to ENEX files as described [here](https://help.evernote.com/hc/en-us/articles/209005557-How-to-back-up-export-and-restore-import-notes-and-notebooks). Then follow these steps: In the **desktop application**, open File > Import > ENEX and select your file. The notes will be imported into a new separate notebook. If needed they can then be moved to a different notebook, or the notebook can be renamed, etc. diff --git a/yarn.lock b/yarn.lock index cf2dc4d0c6..01d59545c6 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6833,6 +6833,7 @@ __metadata: "@types/fs-extra": 11.0.4 "@types/jest": 29.5.8 "@types/js-yaml": 4.0.9 + "@types/markdown-it": 13.0.7 "@types/nanoid": 3.0.0 "@types/node": 18.18.14 "@types/node-rsa": 1.1.4 @@ -7185,8 +7186,10 @@ __metadata: version: 0.0.0-use.local resolution: "@joplin/utils@workspace:packages/utils" dependencies: + "@joplin/fork-htmlparser2": ^4.1.50 "@types/fs-extra": 11.0.4 "@types/jest": 29.5.8 + "@types/markdown-it": 13.0.7 "@types/node-fetch": 2.6.9 async-mutex: 0.4.0 execa: 5.1.1 @@ -7194,6 +7197,7 @@ __metadata: glob: 10.3.10 html-entities: 1.4.0 jest: 29.7.0 + markdown-it: 13.0.2 moment: 2.29.4 node-fetch: 2.6.7 sprintf-js: 1.1.3