Desktop, Cli: Resolves #9596: Restore note links after importing an ENEX file

2024-12-24 10:27:10 +02:00 · 2023-12-26 11:19:29 +00:00 · 2023-12-26 11:19:29 +00:00 · 590769b1ae
commit 590769b1ae
parent 0873b1900b
17 changed files with 345 additions and 14 deletions
--- a/packages/app-cli/tests/enex_to_md/linked_notes.enex
+++ b/packages/app-cli/tests/enex_to_md/linked_notes.enex
@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
+<en-export export-date="20231224T151504Z" application="Evernote" version="10.68.2">
+  <note>
+    <title>Note 1</title>
+    <created>20160730T164129Z</created>
+    <updated>20231224T151443Z</updated>
+    <content>
+      <![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div><a href="evernote:///view/5223870/s49/9cd5e810-fa03-429a-8194-ab847f2f1ab2/c99d9e01-ca35-4c75-ba63-f0c0ef97787d/" rel="noopener noreferrer" rev="en_rl_none">Note 2</a><a href="evernote:///view/5223870/s49/9cd5e810-fa03-429a-8194-ab847f2f1ab2/c99d9e01-ca35-4c75-ba63-f0c0ef97787d/" rel="noopener noreferrer" rev="en_rl_none">Note 3</a></div></en-note>      ]]>
+    </content>
+  </note>
+  <note>
+    <title>Note 2</title>
+    <created>20160730T111759Z</created>
+    <updated>20160730T111807Z</updated>
+    <content>
+      <![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>Testing</div></en-note>      ]]>
+    </content>
+  </note>
+  <note>
+    <title>Note 3</title>
+    <created>20160730T111759Z</created>
+    <updated>20160730T111807Z</updated>
+    <content>
+      <![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div><a href="evernote:///view/5223870/s49/9cd5e810-fa03-429a-8194-ab847f2f1ab2/c99d9e01-ca35-4c75-ba63-f0c0ef97787d/" rel="noopener noreferrer" rev="en_rl_none">Ambiguous note</a></div></en-note>      ]]>
+    </content>
+  </note>
+  <note>
+    <title>Ambiguous note</title>
+    <created>20160730T111759Z</created>
+    <updated>20160730T111807Z</updated>
+    <content>
+      <![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>Testing</div></en-note>      ]]>
+    </content>
+  </note>
+  <note>
+    <title>Ambiguous note</title>
+    <created>20160730T111759Z</created>
+    <updated>20160730T111807Z</updated>
+    <content>
+      <![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>Testing</div></en-note>      ]]>
+    </content>
+  </note>
+</en-export>
--- a/packages/lib/import-enex-md-gen.test.ts
+++ b/packages/lib/import-enex-md-gen.test.ts
@ -214,4 +214,17 @@ describe('import-enex-md-gen', () => {
 		expect(resource.title).toBe('08.06.2014 16:58:55');
 	});

+	it('should resolve note links', async () => {
+		await importEnexFile('linked_notes.enex');
+		const notes: NoteEntity[] = await Note.all();
+
+		const note1 = notes.find(n => n.title === 'Note 1');
+		const note2 = notes.find(n => n.title === 'Note 2');
+		const note3 = notes.find(n => n.title === 'Note 3');
+
+		expect(notes.length).toBe(5); // The fixture contains five notes, two of which share the same title
+		expect(note1.body).toBe(`[Note 2](:/${note2.id})[Note 3](:/${note3.id})`); // Each link title matches exactly one note, so both become Joplin links
+		expect(note3.body).toBe('[Ambiguous note](evernote:///view/5223870/s49/9cd5e810-fa03-429a-8194-ab847f2f1ab2/c99d9e01-ca35-4c75-ba63-f0c0ef97787d/)'); // Two notes are titled "Ambiguous note", so the Evernote link is left as is
+	});
+
 });
--- a/packages/lib/import-enex-md-gen.ts
+++ b/packages/lib/import-enex-md-gen.ts
@ -58,6 +58,7 @@ interface ParserState {
 	spanAttributes: string[];
 	tags: ParserStateTag[];
 	currentCode?: string;
+	evernoteLinkTitles: Record<string, string>;
 }


@ -607,6 +608,7 @@ function enexXmlToMdArray(stream: any, resources: ResourceEntity[], tasks: Extra
 			anchorAttributes: [],
 			spanAttributes: [],
 			tags: [],
+			evernoteLinkTitles: {},
 		};

 		const options = {};
--- a/packages/lib/import-enex.ts
+++ b/packages/lib/import-enex.ts
@ -10,6 +10,8 @@ import { NoteEntity, ResourceEntity } from './services/database/types';
 import { enexXmlToMd } from './import-enex-md-gen';
 import { MarkupToHtml } from '@joplin/renderer';
 import { fileExtension, friendlySafeFilename, safeFileExtension } from './path-utils';
+import { extractUrls as extractUrlsFromHtml } from '@joplin/utils/html';
+import { extractUrls as extractUrlsFromMarkdown } from '@joplin/utils/markdown';
 const moment = require('moment');
 const { wrapError } = require('./errorUtils');
 const { enexXmlToHtml } = require('./import-enex-html-gen.js');
@ -435,6 +437,15 @@ export default async function importEnex(parentFolderId: string, filePath: strin
 			processingNotes = true;
 			stream.pause();

+			// Set the note ID so that we can create a title-to-id map, which
+			// will be needed to recreate the note links below.
+			const noteTitleToId: Record<string, string[]> = {};
+			for (const note of notes) {
+				if (!noteTitleToId[note.title]) noteTitleToId[note.title] = [];
+				note.id = uuid.create();
+				noteTitleToId[note.title].push(note.id);
+			}
+
 			while (notes.length) {
 				const note = notes.shift();

@ -452,20 +463,40 @@ export default async function importEnex(parentFolderId: string, filePath: strin
 						note.resources[i] = resource;
 					}

-					const body = importOptions.outputFormat === 'html' ?
+					// --------------------------------------------------------
+					// Convert the ENEX body to either Markdown or HTML
+					// --------------------------------------------------------
+
+					let body: string = importOptions.outputFormat === 'html' ?
 						await enexXmlToHtml(note.bodyXml, note.resources) :
 						await enexXmlToMd(note.bodyXml, note.resources, note.tasks);
 					delete note.bodyXml;

+					// --------------------------------------------------------
+					// Convert the Evernote note links to Joplin note links. If
+					// we don't find a matching note, or if there are multiple
+					// matching notes, we leave the Evernote links as is.
+					// --------------------------------------------------------
+
+					const links = importOptions.outputFormat === 'html' ?
+						extractUrlsFromHtml(body) :
+						extractUrlsFromMarkdown(body);
+
+					for (const link of links) {
+						const matchingNoteIds = noteTitleToId[link.title];
+						if (matchingNoteIds && matchingNoteIds.length === 1) {
+							body = body.replace(link.url, `:/${matchingNoteIds[0]}`);
+						}
+					}
+
+					// --------------------------------------------------------
+					// Finish setting up the note
+					// --------------------------------------------------------
+
 					note.markup_language = importOptions.outputFormat === 'html' ?
 						MarkupToHtml.MARKUP_LANGUAGE_HTML :
 						MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN;

-					// console.info('*************************************************************************');
-					// console.info(body);
-					// console.info('*************************************************************************');
-
-					note.id = uuid.create();
 					note.parent_id = parentFolderId;
 					note.body = body;

--- a/packages/lib/markdownUtils.test.ts
+++ b/packages/lib/markdownUtils.test.ts
@ -90,5 +90,4 @@ describe('Should detect list items', () => {
 	test('should NOT detect `+ [x]` as empty list item ', () => {
 		expect(markdownUtils.isEmptyListItem('+ [x]')).toBe(false);
 	});
-
 });
--- a/packages/lib/markdownUtils.ts
+++ b/packages/lib/markdownUtils.ts
@ -1,7 +1,7 @@
 import { validateLinks } from '@joplin/renderer';
 const stringPadding = require('string-padding');
 const urlUtils = require('./urlUtils');
-const MarkdownIt = require('markdown-it');
+import * as MarkdownIt from 'markdown-it';

 // Taken from codemirror/addon/edit/continuelist.js
 const listRegex = /^(\s*)([*+-] \[[x ]\]\s|[*+-]\s|(\d+)([.)]\s))(\s*)/;
--- a/packages/lib/package.json
+++ b/packages/lib/package.json
@ -19,6 +19,7 @@
    "@types/fs-extra": "11.0.4",
    "@types/jest": "29.5.8",
    "@types/js-yaml": "4.0.9",
+    "@types/markdown-it": "13.0.7",
    "@types/node": "18.18.14",
    "@types/node-rsa": "1.1.4",
    "@types/react": "18.2.41",
--- a/packages/utils/html.test.ts
+++ b/packages/utils/html.test.ts
@ -0,0 +1,60 @@
+import { extractUrls } from './html';
+import { Link } from './types';
+
+describe('htmlUtils', () => {
+
+	// Each entry is an HTML snippet followed by the links that should be
+	// extracted from it.
+	const testCases: [string, Link[]][] = [
+		// Empty input
+		['', []],
+
+		// Markup that does not contain any anchor
+		['bla >Testing <b>no link</b>"', []],
+
+		// Tags nested inside the anchor are dropped from the title
+		[
+			'bla <a href="https://example.com">Testing <b>link</b></a>"',
+			[{ url: 'https://example.com', title: 'Testing link' }],
+		],
+
+		// Several anchors, including one without an href attribute
+		[
+			'<a href="#">Test 1</a> <a onclick="">Test 2</a>',
+			[
+				{ url: '#', title: 'Test 1' },
+				{ url: '', title: 'Test 2' },
+			],
+		],
+
+		// An anchor that only wraps an image has an empty title
+		[
+			'<a href="https://example.com"><img src="https://test.com/image.png"/></a>',
+			[{ url: 'https://example.com', title: '' }],
+		],
+
+		// HTML entities are decoded in the title
+		[
+			'<a href="#">check &amp; encoding</a>',
+			[{ url: '#', title: 'check & encoding' }],
+		],
+	];
+
+	test.each(testCases)('should retrieve links', (html: string, expected: Link[]) => {
+		expect(extractUrls(html)).toEqual(expected);
+	});
+
+});
--- a/packages/utils/html.ts
+++ b/packages/utils/html.ts
@ -1,6 +1,7 @@
-/* eslint-disable import/prefer-default-export */
+import { Link } from './types';

 const Entities = require('html-entities').AllHtmlEntities;
+const htmlparser2 = require('@joplin/fork-htmlparser2');

 const selfClosingElements = [
 	'area',
@ -40,3 +41,40 @@ export const attributesHtml = (attr: Record<string, any>) => {
 export const isSelfClosingTag = (tagName: string) => {
 	return selfClosingElements.includes(tagName.toLowerCase());
 };
+
+// Extracts every anchor found in `html`, in the order in which the anchors are
+// closed. For each anchor, `url` is the value of its `href` attribute (or an
+// empty string when there is none) and `title` is the text it contains, with
+// nested tags dropped and HTML entities decoded. Returns an empty array when
+// the input is empty or blank.
+export const extractUrls = (html: string): Link[] => {
+	if (!html || !html.trim()) return [];
+
+	const output: Link[] = [];
+
+	// Anchors that have been opened but not closed yet. A stack is needed
+	// because malformed documents can nest anchors: with a single "current
+	// link", the inner closing tag would clear it and the outer closing tag
+	// would then look like a closing tag without an opening one.
+	const openLinks: Link[] = [];
+
+	const parser = new htmlparser2.Parser({
+
+		onopentag: (name: string, attrs: Record<string, string>) => {
+			if (name !== 'a') return;
+
+			openLinks.push({
+				url: attrs && attrs.href ? attrs.href : '',
+				title: '',
+			});
+		},
+
+		ontext: (text: string) => {
+			// The text belongs to every anchor that is currently open
+			for (const link of openLinks) link.title += text;
+		},
+
+		onclosetag: (name: string) => {
+			if (name !== 'a') return;
+
+			// Tolerate a stray closing tag rather than aborting the whole
+			// extraction.
+			const link = openLinks.pop();
+			if (link) output.push(link);
+		},
+
+	}, { decodeEntities: true });
+
+	parser.write(html);
+	parser.end();
+
+	return output;
+};
--- a/packages/utils/jest.config.js
+++ b/packages/utils/jest.config.js
@ -16,4 +16,6 @@ module.exports = {
 	testPathIgnorePatterns: ['<rootDir>/node_modules/'],

 	slowTestThreshold: 40,
+
+	setupFilesAfterEnv: [`${__dirname}/jest.setup.js`],
 };
--- a/packages/utils/jest.setup.js
+++ b/packages/utils/jest.setup.js
@ -0,0 +1 @@
+require('../../jest.base-setup.js')();
--- a/packages/utils/markdown.test.ts
+++ b/packages/utils/markdown.test.ts
@ -0,0 +1,55 @@
+import { extractUrls } from './markdown';
+import { Link } from './types';
+
+describe('markdown', () => {
+
+	// Each entry is a Markdown document followed by the links that should be
+	// extracted from it.
+	const testCases: [string, Link[]][] = [
+		// Empty input
+		['', []],
+
+		// Text that does not contain any link
+		['Some text and no links', []],
+
+		// A link with an empty title
+		[
+			'[](https://example.com)',
+			[{ url: 'https://example.com', title: '' }],
+		],
+
+		// Several links, one of which has Markdown formatting in its title
+		[
+			'before [testing](https://example.com) [testing **with bold**](https://example2.com) after',
+			[
+				{ url: 'https://example.com', title: 'testing' },
+				{ url: 'https://example2.com', title: 'testing with bold' },
+			],
+		],
+
+		// A Markdown link followed by an HTML anchor
+		[
+			'[Testing MD](https://example.com/md) <a href="https://example.com/html">Testing HTML</a>',
+			[
+				{ url: 'https://example.com/md', title: 'Testing MD' },
+				{ url: 'https://example.com/html', title: 'Testing HTML' },
+			],
+		],
+	];
+
+	test.each(testCases)('should extract URLs', (md: string, expected: Link[]) => {
+		expect(extractUrls(md)).toEqual(expected);
+	});
+
+});
--- a/packages/utils/markdown.ts
+++ b/packages/utils/markdown.ts
@ -0,0 +1,67 @@
+/* eslint-disable import/prefer-default-export */
+
+import * as MarkdownIt from 'markdown-it';
+import { Link } from './types';
+
+// Link validator for MarkdownIt that enables file link URLs, such as
+// `[link name](file://...)`, and keeps the other URL restrictions of
+// MarkdownIt: `vbscript:`, `javascript:` and `data:` URLs are rejected,
+// except for `data:` URLs that embed a GIF, PNG, JPEG, WebP or SVG image.
+const validateLinks = (url: string) => {
+	// The URL is expected to be normalized at this point, with its existing
+	// entities decoded.
+	const normalizedUrl = url.trim().toLowerCase();
+
+	if (normalizedUrl.startsWith('data:image/svg+xml,')) return true;
+
+	const hasForbiddenScheme = /^(vbscript|javascript|data):/.test(normalizedUrl);
+	if (!hasForbiddenScheme) return true;
+
+	return /^data:image\/(gif|png|jpeg|webp);/.test(normalizedUrl);
+};
+
+// Returns the links found in the Markdown document `md`: first the links
+// written with the Markdown syntax, then the links written as HTML anchors.
+//
+// Note that the title is stripped of any Markdown code. So `title with
+// **bold**` will become `title with bold`, and the text of an inline code
+// span is kept without its backticks. The title of an HTML anchor is
+// returned as it appears in the document.
+export const extractUrls = (md: string): Link[] => {
+	const markdownIt = new MarkdownIt();
+	markdownIt.validateLink = validateLinks; // Necessary to support file:/// links
+
+	const env = {};
+	const tokens = markdownIt.parse(md, env);
+	const output: Link[] = [];
+
+	// Walks the token tree. `currentLink` is the link being built, if any - it
+	// is passed down to the children so that the text found in nested tokens
+	// is added to its title.
+	const searchUrls = (tokens: MarkdownIt.Token[], currentLink: Link|null) => {
+		for (const token of tokens) {
+			if (token.type === 'link_open') {
+				currentLink = {
+					title: '',
+					url: token.attrGet('href') || '',
+				};
+			} else if (token.type === 'link_close') {
+				if (!currentLink) throw new Error('Found a link_close without a link_open');
+				output.push(currentLink);
+				currentLink = null;
+			} else if (token.children && token.children.length) {
+				searchUrls(token.children, currentLink);
+			} else if ((token.type === 'text' || token.type === 'code_inline') && currentLink) {
+				// Inline code is part of the visible link text too, so it must
+				// be included or the title would not match the note title.
+				currentLink.title += token.content;
+			}
+		}
+	};
+
+	searchUrls(tokens, null);
+
+	// Definitely won't work in all cases but for our particular use case,
+	// processing Markdown generated from ENEX documents, that should be enough.
+	//
+	// The match is restricted to a single `<a ...>` tag, `href` must be a full
+	// attribute name (and not the end of, for example, `data-href`), and the
+	// value must be closed by the same quote that opened it.
+	const htmlAnchorRegex = /<a\s(?:[^>]*?\s)?href=(["'])(.*?)\1[^>]*>(.*?)<\/a>/ig;
+	let result: RegExpExecArray | null;
+	while ((result = htmlAnchorRegex.exec(md)) !== null) {
+		output.push({
+			url: result[2],
+			title: result[3],
+		});
+	}
+
+	return output;
+};
--- a/packages/utils/package.json
+++ b/packages/utils/package.json
@ -8,11 +8,12 @@
    "./env": "./dist/env.js",
    "./fs": "./dist/fs.js",
    "./html": "./dist/html.js",
+    "./Logger": "./dist/Logger.js",
+    "./markdown": "./dist/markdown.js",
    "./net": "./dist/net.js",
    "./time": "./dist/time.js",
    "./types": "./dist/types.js",
-    "./url": "./dist/url.js",
-    "./Logger": "./dist/Logger.js"
+    "./url": "./dist/url.js"
  },
  "publishConfig": {
    "access": "public"
@ -27,11 +28,13 @@
  "author": "",
  "license": "AGPL-3.0-or-later",
  "dependencies": {
+    "@joplin/fork-htmlparser2": "^4.1.50",
    "async-mutex": "0.4.0",
    "execa": "5.1.1",
    "fs-extra": "11.1.1",
    "glob": "10.3.10",
    "html-entities": "1.4.0",
+    "markdown-it": "13.0.2",
    "moment": "2.29.4",
    "node-fetch": "2.6.7",
    "sprintf-js": "1.1.3"
@ -39,6 +42,7 @@
  "devDependencies": {
    "@types/fs-extra": "11.0.4",
    "@types/jest": "29.5.8",
+    "@types/markdown-it": "13.0.7",
    "@types/node-fetch": "2.6.9",
    "jest": "29.7.0",
    "ts-jest": "29.1.1"
--- a/packages/utils/types.ts
+++ b/packages/utils/types.ts
@ -1,6 +1,9 @@
-/* eslint-disable import/prefer-default-export */
-
 export interface Size {
 	width?: number;
 	height?: number;
 }
+
+export interface Link {
+	title: string; // Text of the link, as collected by the HTML and Markdown link extractors
+	url: string; // Target of the link (the `href` value), or an empty string if there is none
+}
--- a/readme/apps/import_export.md
+++ b/readme/apps/import_export.md
@ -6,10 +6,12 @@

 Joplin was designed as a replacement for Evernote and so can import complete Evernote notebooks, as well as notes, tags, resources (attached files) and note metadata (such as author, geo-location, etc.) via ENEX files. In terms of data, the only two things that might slightly differ are:

- Recognition data - Evernote images, in particular scanned (or photographed) documents have [recognition data](https://en.wikipedia.org/wiki/Optical_character_recognition) associated with them. It is the text that Evernote has been able to recognise in the document. This data is not preserved when the note are imported into Joplin. However, should it become supported in the search tool or other parts of Joplin, it should be possible to regenerate this recognition data since the actual image would still be available.
+- Recognition data - Evernote images, in particular scanned (or photographed) documents have [recognition data](https://en.wikipedia.org/wiki/Optical_character_recognition) associated with them. It is the text that Evernote has been able to recognise in the document. This data is not preserved when the notes are imported into Joplin. However, if you have enabled OCR in Joplin, that recognition data will be recreated in a format compatible with Joplin.

 - Colour, font sizes and faces - Evernote text is stored as HTML and this is converted to Markdown during the import process. For notes that are mostly plain text or with basic formatting (bold, italic, bullet points, links, etc.) this is a lossless conversion, and the note, once rendered back to HTML should be very similar. Tables are also imported and converted to Markdown tables. For very complex notes, some formatting data might be lost - in particular colours, font sizes and font faces will not be imported. The text itself however is always imported in full regardless of formatting. If it is essential that this extra data is preserved then Joplin also allows import of ENEX files as HTML.

+- Links between notes are mostly preserved. However, the ENEX format does not include all the necessary information to find out what the target of a link is (specifically, Evernote uses an ID for the link but that ID is not associated with the target note). Instead, Joplin tries to guess which note is linked based on the note title, which mostly works, but not always - for example if multiple notes have the same title, or if the link title is different from the target note title. If Joplin cannot guess how to restore the link, the Evernote link will remain.
+
 To import Evernote data, first export your Evernote notebooks to ENEX files as described [here](https://help.evernote.com/hc/en-us/articles/209005557-How-to-back-up-export-and-restore-import-notes-and-notebooks). Then follow these steps:

 In the **desktop application**, open File > Import > ENEX and select your file. The notes will be imported into a new separate notebook. If needed they can then be moved to a different notebook, or the notebook can be renamed, etc.
--- a/yarn.lock
+++ b/yarn.lock
@ -6833,6 +6833,7 @@ __metadata:
    "@types/fs-extra": 11.0.4
    "@types/jest": 29.5.8
    "@types/js-yaml": 4.0.9
+    "@types/markdown-it": 13.0.7
    "@types/nanoid": 3.0.0
    "@types/node": 18.18.14
    "@types/node-rsa": 1.1.4
@ -7185,8 +7186,10 @@ __metadata:
  version: 0.0.0-use.local
  resolution: "@joplin/utils@workspace:packages/utils"
  dependencies:
+    "@joplin/fork-htmlparser2": ^4.1.50
    "@types/fs-extra": 11.0.4
    "@types/jest": 29.5.8
+    "@types/markdown-it": 13.0.7
    "@types/node-fetch": 2.6.9
    async-mutex: 0.4.0
    execa: 5.1.1
@ -7194,6 +7197,7 @@ __metadata:
    glob: 10.3.10
    html-entities: 1.4.0
    jest: 29.7.0
+    markdown-it: 13.0.2
    moment: 2.29.4
    node-fetch: 2.6.7
    sprintf-js: 1.1.3