1
0
mirror of https://github.com/laurent22/joplin.git synced 2024-11-24 08:12:24 +02:00

Desktop, Cli: Resolves #9596: Restore note links after importing an ENEX file

This commit is contained in:
Laurent Cozic 2023-12-26 11:19:29 +00:00
parent 0873b1900b
commit 590769b1ae
17 changed files with 345 additions and 14 deletions

View File

@ -0,0 +1,49 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
<en-export export-date="20231224T151504Z" application="Evernote" version="10.68.2">
<note>
<title>Note 1</title>
<created>20160730T164129Z</created>
<updated>20231224T151443Z</updated>
<content>
<![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div><a href="evernote:///view/5223870/s49/9cd5e810-fa03-429a-8194-ab847f2f1ab2/c99d9e01-ca35-4c75-ba63-f0c0ef97787d/" rel="noopener noreferrer" rev="en_rl_none">Note 2</a><a href="evernote:///view/5223870/s49/9cd5e810-fa03-429a-8194-ab847f2f1ab2/c99d9e01-ca35-4c75-ba63-f0c0ef97787d/" rel="noopener noreferrer" rev="en_rl_none">Note 3</a></div></en-note> ]]>
</content>
</note>
<note>
<title>Note 2</title>
<created>20160730T111759Z</created>
<updated>20160730T111807Z</updated>
<content>
<![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>Testing</div></en-note> ]]>
</content>
</note>
<note>
<title>Note 3</title>
<created>20160730T111759Z</created>
<updated>20160730T111807Z</updated>
<content>
<![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div><a href="evernote:///view/5223870/s49/9cd5e810-fa03-429a-8194-ab847f2f1ab2/c99d9e01-ca35-4c75-ba63-f0c0ef97787d/" rel="noopener noreferrer" rev="en_rl_none">Ambiguous note</a></div></en-note> ]]>
</content>
</note>
<note>
<title>Ambiguous note</title>
<created>20160730T111759Z</created>
<updated>20160730T111807Z</updated>
<content>
<![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>Testing</div></en-note> ]]>
</content>
</note>
<note>
<title>Ambiguous note</title>
<created>20160730T111759Z</created>
<updated>20160730T111807Z</updated>
<content>
<![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"><en-note><div>Testing</div></en-note> ]]>
</content>
</note>
</en-export>

View File

@ -214,4 +214,17 @@ describe('import-enex-md-gen', () => {
expect(resource.title).toBe('08.06.2014 16:58:55');
});
it('should resolve note links', async () => {
	await importEnexFile('linked_notes.enex');

	const importedNotes: NoteEntity[] = await Note.all();
	const findByTitle = (title: string) => importedNotes.find(n => n.title === title);
	const note1 = findByTitle('Note 1');
	const note2 = findByTitle('Note 2');
	const note3 = findByTitle('Note 3');

	expect(importedNotes.length).toBe(5);

	// A link whose title matches exactly one imported note is expected to be
	// rewritten as a Joplin link pointing to that note.
	expect(note1.body).toBe(`[Note 2](:/${note2.id})[Note 3](:/${note3.id})`);

	// The fixture contains two notes titled "Ambiguous note", so this link
	// is expected to keep its original Evernote URL.
	expect(note3.body).toBe('[Ambiguous note](evernote:///view/5223870/s49/9cd5e810-fa03-429a-8194-ab847f2f1ab2/c99d9e01-ca35-4c75-ba63-f0c0ef97787d/)');
});
});

View File

@ -58,6 +58,7 @@ interface ParserState {
spanAttributes: string[];
tags: ParserStateTag[];
currentCode?: string;
evernoteLinkTitles: Record<string, string>;
}
@ -607,6 +608,7 @@ function enexXmlToMdArray(stream: any, resources: ResourceEntity[], tasks: Extra
anchorAttributes: [],
spanAttributes: [],
tags: [],
evernoteLinkTitles: {},
};
const options = {};

View File

@ -10,6 +10,8 @@ import { NoteEntity, ResourceEntity } from './services/database/types';
import { enexXmlToMd } from './import-enex-md-gen';
import { MarkupToHtml } from '@joplin/renderer';
import { fileExtension, friendlySafeFilename, safeFileExtension } from './path-utils';
import { extractUrls as extractUrlsFromHtml } from '@joplin/utils/html';
import { extractUrls as extractUrlsFromMarkdown } from '@joplin/utils/markdown';
const moment = require('moment');
const { wrapError } = require('./errorUtils');
const { enexXmlToHtml } = require('./import-enex-html-gen.js');
@ -435,6 +437,15 @@ export default async function importEnex(parentFolderId: string, filePath: strin
processingNotes = true;
stream.pause();
// Set the note ID so that we can create a title-to-id map, which
// will be needed to recreate the note links below.
const noteTitleToId: Record<string, string[]> = {};
for (const note of notes) {
if (!noteTitleToId[note.title]) noteTitleToId[note.title] = [];
note.id = uuid.create();
noteTitleToId[note.title].push(note.id);
}
while (notes.length) {
const note = notes.shift();
@ -452,20 +463,40 @@ export default async function importEnex(parentFolderId: string, filePath: strin
note.resources[i] = resource;
}
const body = importOptions.outputFormat === 'html' ?
// --------------------------------------------------------
// Convert the ENEX body to either Markdown or HTML
// --------------------------------------------------------
let body: string = importOptions.outputFormat === 'html' ?
await enexXmlToHtml(note.bodyXml, note.resources) :
await enexXmlToMd(note.bodyXml, note.resources, note.tasks);
delete note.bodyXml;
// --------------------------------------------------------
// Convert the Evernote note links to Joplin note links. If
// we don't find a matching note, or if there are multiple
// matching notes, we leave the Evernote links as is.
// --------------------------------------------------------
const links = importOptions.outputFormat === 'html' ?
extractUrlsFromHtml(body) :
extractUrlsFromMarkdown(body);
for (const link of links) {
const matchingNoteIds = noteTitleToId[link.title];
if (matchingNoteIds && matchingNoteIds.length === 1) {
body = body.replace(link.url, `:/${matchingNoteIds[0]}`);
}
}
// --------------------------------------------------------
// Finish setting up the note
// --------------------------------------------------------
note.markup_language = importOptions.outputFormat === 'html' ?
MarkupToHtml.MARKUP_LANGUAGE_HTML :
MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN;
// console.info('*************************************************************************');
// console.info(body);
// console.info('*************************************************************************');
note.id = uuid.create();
note.parent_id = parentFolderId;
note.body = body;

View File

@ -90,5 +90,4 @@ describe('Should detect list items', () => {
test('should NOT detect `+ [x]` as empty list item ', () => {
	// `+ [x]` must not be reported as an empty list item
	const isEmpty = markdownUtils.isEmptyListItem('+ [x]');
	expect(isEmpty).toBe(false);
});
});

View File

@ -1,7 +1,7 @@
import { validateLinks } from '@joplin/renderer';
const stringPadding = require('string-padding');
const urlUtils = require('./urlUtils');
const MarkdownIt = require('markdown-it');
import * as MarkdownIt from 'markdown-it';
// Taken from codemirror/addon/edit/continuelist.js
const listRegex = /^(\s*)([*+-] \[[x ]\]\s|[*+-]\s|(\d+)([.)]\s))(\s*)/;

View File

@ -19,6 +19,7 @@
"@types/fs-extra": "11.0.4",
"@types/jest": "29.5.8",
"@types/js-yaml": "4.0.9",
"@types/markdown-it": "13.0.7",
"@types/node": "18.18.14",
"@types/node-rsa": "1.1.4",
"@types/react": "18.2.41",

View File

@ -0,0 +1,60 @@
import { extractUrls } from './html';
import { Link } from './types';
describe('htmlUtils', () => {

	// Each case is a pair made of the HTML input and the links that are
	// expected to be extracted from it.
	const testCases: [string, Link[]][] = [
		// Empty input
		['', []],

		// Markup without any anchor
		['bla >Testing <b>no link</b>"', []],

		// Tags nested inside the anchor are not part of the title
		[
			'bla <a href="https://example.com">Testing <b>link</b></a>"',
			[{ url: 'https://example.com', title: 'Testing link' }],
		],

		// An anchor without an "href" attribute gets an empty URL
		[
			'<a href="#">Test 1</a> <a onclick="">Test 2</a>',
			[
				{ url: '#', title: 'Test 1' },
				{ url: '', title: 'Test 2' },
			],
		],

		// An anchor that contains no text gets an empty title
		[
			'<a href="https://example.com"><img src="https://test.com/image.png"/></a>',
			[{ url: 'https://example.com', title: '' }],
		],

		// HTML entities in the title are decoded
		[
			'<a href="#">check &amp; encoding</a>',
			[{ url: '#', title: 'check & encoding' }],
		],
	];

	test.each(testCases)('should retrieve links', (html: string, expected: Link[]) => {
		expect(extractUrls(html)).toEqual(expected);
	});

});

View File

@ -1,6 +1,7 @@
/* eslint-disable import/prefer-default-export */
import { Link } from './types';
const Entities = require('html-entities').AllHtmlEntities;
const htmlparser2 = require('@joplin/fork-htmlparser2');
const selfClosingElements = [
'area',
@ -40,3 +41,40 @@ export const attributesHtml = (attr: Record<string, any>) => {
// Tells whether the given tag name, compared case-insensitively, is listed
// in `selfClosingElements`.
export const isSelfClosingTag = (tagName: string) => {
	const normalisedName = tagName.toLowerCase();
	return selfClosingElements.includes(normalisedName);
};
// Extracts all the links (`<a>` elements) found in the provided HTML.
//
// For each anchor, `url` is the value of its `href` attribute (or an empty
// string if there is none) and `title` is its text content, without the
// nested tags and with HTML entities decoded. Links are returned in the order
// in which their opening tag appears in the document. An empty or blank input
// yields an empty array.
//
// Malformed input is tolerated rather than rejected: when anchors are nested,
// each of them is returned as a separate link, and the text of the inner
// anchor is also part of the title of the outer one.
export const extractUrls = (html: string): Link[] => {
	if (!html || !html.trim()) return [];

	const output: Link[] = [];

	// Anchors that have been opened but not closed yet, innermost last. A
	// stack is needed because the parser does not implicitly close an `<a>`
	// when another one is opened, so nested anchors produce two consecutive
	// "close" events. Tracking a single current link would make the second
	// event look like a closing tag without an opening one.
	const openLinks: Link[] = [];

	const parser = new htmlparser2.Parser({

		onopentag: (name: string, attrs: Record<string, string>) => {
			if (name !== 'a') return;

			const link: Link = {
				url: attrs && attrs.href ? attrs.href : '',
				title: '',
			};

			openLinks.push(link);

			// Registering the link as soon as it is opened keeps the output
			// in document order, including when anchors are nested.
			output.push(link);
		},

		ontext: (text: string) => {
			// The text belongs to every anchor that currently encloses it
			for (const link of openLinks) link.title += text;
		},

		onclosetag: (name: string) => {
			if (name === 'a') openLinks.pop();
		},

	}, { decodeEntities: true });

	parser.write(html);
	parser.end();

	return output;
};

View File

@ -16,4 +16,6 @@ module.exports = {
testPathIgnorePatterns: ['<rootDir>/node_modules/'],
slowTestThreshold: 40,
setupFilesAfterEnv: [`${__dirname}/jest.setup.js`],
};

View File

@ -0,0 +1 @@
// Load the base setup module located two directories up and run the function
// that it exports.
const runBaseSetup = require('../../jest.base-setup.js');
runBaseSetup();

View File

@ -0,0 +1,55 @@
import { extractUrls } from './markdown';
import { Link } from './types';
describe('markdown', () => {

	// Each case is a pair made of the Markdown input and the links that are
	// expected to be extracted from it.
	const testCases: [string, Link[]][] = [
		// Empty input
		['', []],

		// Text without any link
		['Some text and no links', []],

		// A link with an empty title
		[
			'[](https://example.com)',
			[{ url: 'https://example.com', title: '' }],
		],

		// Markdown formatting is stripped from the title
		[
			'before [testing](https://example.com) [testing **with bold**](https://example2.com) after',
			[
				{ url: 'https://example.com', title: 'testing' },
				{ url: 'https://example2.com', title: 'testing with bold' },
			],
		],

		// HTML anchors embedded in the Markdown are extracted too
		[
			'[Testing MD](https://example.com/md) <a href="https://example.com/html">Testing HTML</a>',
			[
				{ url: 'https://example.com/md', title: 'Testing MD' },
				{ url: 'https://example.com/html', title: 'Testing HTML' },
			],
		],
	];

	test.each(testCases)('should extract URLs', (md: string, expected: Link[]) => {
		expect(extractUrls(md)).toEqual(expected);
	});

});

View File

@ -0,0 +1,67 @@
/* eslint-disable import/prefer-default-export */
import * as MarkdownIt from 'markdown-it';
import { Link } from './types';
// Link validator meant to replace the default one of MarkdownIt so that
// file:// URLs (format: [link name](file://...)) are accepted. The other
// restrictions are kept: `vbscript:`, `javascript:` and `data:` URLs are
// rejected, except for `data:` URLs that embed a GIF, PNG, JPEG or WebP
// image, or an SVG image in the `data:image/svg+xml,` form.
const validateLinks = (url: string) => {
	// The URL should be normalized at this point, and existing entities are
	// decoded
	const normalisedUrl = url.trim().toLowerCase();

	if (normalisedUrl.startsWith('data:image/svg+xml,')) return true;

	const hasRestrictedProtocol = /^(vbscript|javascript|data):/.test(normalisedUrl);
	if (!hasRestrictedProtocol) return true;

	// Among the restricted protocols, only certain image data URLs are allowed
	return /^data:image\/(gif|png|jpeg|webp);/.test(normalisedUrl);
};
// Extracts the links found in a Markdown document, both those written as
// Markdown links and those written as HTML anchors. The title of a Markdown
// link is returned without any Markdown code, so `title with **bold**` will
// become `title with bold`.
export const extractUrls = (md: string): Link[] => {
	const parser = new MarkdownIt();
	parser.validateLink = validateLinks; // Necessary to support file:/// links

	const links: Link[] = [];

	// Walks through the tokens, and recursively through their children,
	// building a link from each link_open/link_close pair and the text tokens
	// found in between.
	const collectLinks = (nodes: MarkdownIt.Token[], openLink: Link|null) => {
		for (const node of nodes) {
			if (node.type === 'link_open') {
				openLink = {
					title: '',
					url: node.attrGet('href') || '',
				};
				continue;
			}

			if (node.type === 'link_close') {
				if (!openLink) throw new Error('Found a link_close without a link_open');
				links.push(openLink);
				openLink = null;
				continue;
			}

			if (node.children && node.children.length) {
				collectLinks(node.children, openLink);
				continue;
			}

			if (node.type === 'text' && openLink) openLink.title += node.content;
		}
	};

	collectLinks(parser.parse(md, {}), null);

	// Definitely won't work in all cases but for our particular use case,
	// processing Markdown generated from ENEX documents, that should be enough.
	const htmlAnchorRegex = /<a[\s\S]*?href=["'](.*?)["'][\s\S]*?>(.*?)<\/a>/ig;

	for (let match = htmlAnchorRegex.exec(md); match !== null; match = htmlAnchorRegex.exec(md)) {
		links.push({
			url: match[1],
			title: match[2],
		});
	}

	return links;
};

View File

@ -8,11 +8,12 @@
"./env": "./dist/env.js",
"./fs": "./dist/fs.js",
"./html": "./dist/html.js",
"./Logger": "./dist/Logger.js",
"./markdown": "./dist/markdown.js",
"./net": "./dist/net.js",
"./time": "./dist/time.js",
"./types": "./dist/types.js",
"./url": "./dist/url.js",
"./Logger": "./dist/Logger.js"
"./url": "./dist/url.js"
},
"publishConfig": {
"access": "public"
@ -27,11 +28,13 @@
"author": "",
"license": "AGPL-3.0-or-later",
"dependencies": {
"@joplin/fork-htmlparser2": "^4.1.50",
"async-mutex": "0.4.0",
"execa": "5.1.1",
"fs-extra": "11.1.1",
"glob": "10.3.10",
"html-entities": "1.4.0",
"markdown-it": "13.0.2",
"moment": "2.29.4",
"node-fetch": "2.6.7",
"sprintf-js": "1.1.3"
@ -39,6 +42,7 @@
"devDependencies": {
"@types/fs-extra": "11.0.4",
"@types/jest": "29.5.8",
"@types/markdown-it": "13.0.7",
"@types/node-fetch": "2.6.9",
"jest": "29.7.0",
"ts-jest": "29.1.1"

View File

@ -1,6 +1,9 @@
/* eslint-disable import/prefer-default-export */
// A pair of dimensions. Both properties are optional, so either one (or
// both) may be left undefined.
export interface Size {
width?: number;
height?: number;
}
// A link found in a document: `title` is the text of the link and `url` is
// the address it points to.
export interface Link {
title: string;
url: string;
}

View File

@ -6,10 +6,12 @@
Joplin was designed as a replacement for Evernote and so can import complete Evernote notebooks, as well as notes, tags, resources (attached files) and note metadata (such as author, geo-location, etc.) via ENEX files. In terms of data, the only things that might slightly differ are:
- Recognition data - Evernote images, in particular scanned (or photographed) documents have [recognition data](https://en.wikipedia.org/wiki/Optical_character_recognition) associated with them. It is the text that Evernote has been able to recognise in the document. This data is not preserved when the note are imported into Joplin. However, should it become supported in the search tool or other parts of Joplin, it should be possible to regenerate this recognition data since the actual image would still be available.
- Recognition data - Evernote images, in particular scanned (or photographed) documents have [recognition data](https://en.wikipedia.org/wiki/Optical_character_recognition) associated with them. It is the text that Evernote has been able to recognise in the document. This data is not preserved when the notes are imported into Joplin. However, if you have enabled OCR in Joplin, that recognition data will be recreated in a format compatible with Joplin.
- Colour, font sizes and faces - Evernote text is stored as HTML and this is converted to Markdown during the import process. For notes that are mostly plain text or with basic formatting (bold, italic, bullet points, links, etc.) this is a lossless conversion, and the note, once rendered back to HTML should be very similar. Tables are also imported and converted to Markdown tables. For very complex notes, some formatting data might be lost - in particular colours, font sizes and font faces will not be imported. The text itself however is always imported in full regardless of formatting. If it is essential that this extra data is preserved then Joplin also allows import of ENEX files as HTML.
- Links between notes are mostly preserved. However the ENEX format does not include all the necessary information to find out what the target of a link is (specifically, Evernote uses an ID for the link but that ID is not associated with the target note). Instead Joplin tries to guess which note is linked based on the note title, which mostly works, but not always - for example if multiple notes have the same title, or if the link title is different from the target note title. If Joplin cannot guess how to restore the link, the Evernote link will remain.
To import Evernote data, first export your Evernote notebooks to ENEX files as described [here](https://help.evernote.com/hc/en-us/articles/209005557-How-to-back-up-export-and-restore-import-notes-and-notebooks). Then follow these steps:
In the **desktop application**, open File > Import > ENEX and select your file. The notes will be imported into a new separate notebook. If needed they can then be moved to a different notebook, or the notebook can be renamed, etc.

View File

@ -6833,6 +6833,7 @@ __metadata:
"@types/fs-extra": 11.0.4
"@types/jest": 29.5.8
"@types/js-yaml": 4.0.9
"@types/markdown-it": 13.0.7
"@types/nanoid": 3.0.0
"@types/node": 18.18.14
"@types/node-rsa": 1.1.4
@ -7185,8 +7186,10 @@ __metadata:
version: 0.0.0-use.local
resolution: "@joplin/utils@workspace:packages/utils"
dependencies:
"@joplin/fork-htmlparser2": ^4.1.50
"@types/fs-extra": 11.0.4
"@types/jest": 29.5.8
"@types/markdown-it": 13.0.7
"@types/node-fetch": 2.6.9
async-mutex: 0.4.0
execa: 5.1.1
@ -7194,6 +7197,7 @@ __metadata:
glob: 10.3.10
html-entities: 1.4.0
jest: 29.7.0
markdown-it: 13.0.2
moment: 2.29.4
node-fetch: 2.6.7
sprintf-js: 1.1.3