From 20b1c2e7cbd78676014e733801acc255ad4ae773 Mon Sep 17 00:00:00 2001 From: Laurent Cozic Date: Mon, 8 Jan 2024 10:48:51 +0000 Subject: [PATCH] Desktop, Cli: Fix ENEX import issue Ref: https://discourse.joplinapp.org/t/error-importing-notes-from-format-enex/35001 --- packages/lib/import-enex-md-gen.test.ts | 8 +- packages/lib/import-enex-md-gen.ts | 2 - packages/lib/import-enex.ts | 196 +++++++++++------------- packages/lib/package.json | 1 - yarn.lock | 8 - 5 files changed, 90 insertions(+), 125 deletions(-) diff --git a/packages/lib/import-enex-md-gen.test.ts b/packages/lib/import-enex-md-gen.test.ts index 6923c5ec6..ac5663500 100644 --- a/packages/lib/import-enex-md-gen.test.ts +++ b/packages/lib/import-enex-md-gen.test.ts @@ -6,16 +6,16 @@ const os = require('os'); const { filename } = require('./path-utils'); import { setupDatabaseAndSynchronizer, switchClient, expectNotThrow, supportDir, expectThrow } from './testing/test-utils'; const { enexXmlToMd } = require('./import-enex-md-gen.js'); -import importEnex from './import-enex'; +import importEnex, { ImportOptions } from './import-enex'; import Note from './models/Note'; import Tag from './models/Tag'; import Resource from './models/Resource'; const enexSampleBaseDir = `${supportDir}/../enex_to_md`; -const importEnexFile = async (filename: string) => { +const importEnexFile = async (filename: string, options: ImportOptions = null) => { const filePath = `${enexSampleBaseDir}/${filename}`; - await importEnex('', filePath); + await importEnex('', filePath, options); }; const readExpectedFile = async (filename: string) => { @@ -221,7 +221,7 @@ describe('import-enex-md-gen', () => { }); it('should resolve note links', async () => { - await importEnexFile('linked_notes.enex'); + await importEnexFile('linked_notes.enex', { batchSize: 1 }); const notes: NoteEntity[] = await Note.all(); const note1 = notes.find(n => n.title === 'Note 1'); diff --git a/packages/lib/import-enex-md-gen.ts b/packages/lib/import-enex-md-gen.ts index 156518d7d..b27a1115a 100644 --- a/packages/lib/import-enex-md-gen.ts +++ b/packages/lib/import-enex-md-gen.ts @@ -58,7 +58,6 @@ interface ParserState { spanAttributes: string[]; tags: ParserStateTag[]; currentCode?: string; - evernoteLinkTitles: Record; } @@ -608,7 +607,6 @@ function enexXmlToMdArray(stream: any, resources: ResourceEntity[], tasks: Extra anchorAttributes: [], spanAttributes: [], tags: [], - evernoteLinkTitles: {}, }; const options = {}; diff --git a/packages/lib/import-enex.ts b/packages/lib/import-enex.ts index d40212db1..4866b0fde 100644 --- a/packages/lib/import-enex.ts +++ b/packages/lib/import-enex.ts @@ -1,10 +1,8 @@ import uuid from './uuid'; -import BaseModel from './BaseModel'; import Note from './models/Note'; import Tag from './models/Tag'; import Resource from './models/Resource'; import Setting from './models/Setting'; -import time from './time'; import shim from './shim'; import { NoteEntity, ResourceEntity } from './services/database/types'; import { enexXmlToMd } from './import-enex-md-gen'; @@ -15,7 +13,6 @@ import { extractUrls as extractUrlsFromMarkdown } from '@joplin/utils/markdown'; const moment = require('moment'); const { wrapError } = require('./errorUtils'); const { enexXmlToHtml } = require('./import-enex-html-gen.js'); -const Levenshtein = require('levenshtein'); const md5 = require('md5'); const { Base64Decode } = require('base64-stream'); const md5File = require('md5-file'); @@ -96,38 +93,6 @@ function removeUndefinedProperties(note: NoteEntity) { return output; } -function levenshteinPercent(s1: string, s2: string) { - const l = new Levenshtein(s1, s2); - if (!s1.length || !s2.length) return 1; - return Math.abs(l.distance / s1.length); -} - -async function fuzzyMatch(note: ExtractedNote) { - if (note.created_time < time.unixMs() - 1000 * 60 * 60 * 24 * 360) { - const notes = await Note.modelSelectAll('SELECT * FROM notes WHERE is_conflict = 0 AND created_time = ? AND title = ?', [note.created_time, note.title]); - return notes.length !== 1 ? null : notes[0]; - } - - const notes = await Note.modelSelectAll('SELECT * FROM notes WHERE is_conflict = 0 AND created_time = ?', [note.created_time]); - if (notes.length === 0) return null; - if (notes.length === 1) return notes[0]; - - let lowestL = 1; - let lowestN = null; - for (let i = 0; i < notes.length; i++) { - const n = notes[i]; - const l = levenshteinPercent(note.title, n.title); - if (l < lowestL) { - lowestL = l; - lowestN = n; - } - } - - if (lowestN && lowestL < 0.2) return lowestN; - - return null; -} - interface ExtractedResource { hasData?: boolean; id?: string; @@ -155,6 +120,14 @@ interface ExtractedNote extends NoteEntity { bodyXml?: string; } +// Those are the notes that have been parsed and saved to Joplin. We don't keep +// in memory the whole `ExtractedNote` because it contains resource data, etc. +// We only keep what is needed to restore the note links. +interface SavedNote { + id: string; + body: string; +} + // At this point we have the resource as it's been parsed from the XML, but // additional processing needs to be done to get the final resource file, its // size, MD5, etc. @@ -245,26 +218,19 @@ async function saveNoteTags(note: ExtractedNote) { return notesTagged; } -interface ImportOptions { - fuzzyMatching?: boolean; +export interface ImportOptions { // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied onProgress?: Function; // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied onError?: Function; outputFormat?: string; + batchSize?: number; } -async function saveNoteToStorage(note: ExtractedNote, importOptions: ImportOptions) { - importOptions = { fuzzyMatching: false, ...importOptions }; - +async function saveNoteToStorage(note: ExtractedNote) { note = Note.filter(note as any); - const existingNote = importOptions.fuzzyMatching ? await fuzzyMatch(note) : null; - const result = { - noteCreated: false, - noteUpdated: false, - noteSkipped: false, resourcesCreated: 0, notesTagged: 0, }; @@ -275,28 +241,10 @@ async function saveNoteToStorage(note: ExtractedNote, importOptions: ImportOptio const notesTagged = await saveNoteTags(note); result.notesTagged += notesTagged; - if (existingNote) { - const diff = BaseModel.diffObjects(existingNote, note); - delete diff.tags; - delete diff.resources; - delete diff.id; - - if (!Object.getOwnPropertyNames(diff).length) { - result.noteSkipped = true; - return result; - } - - diff.id = existingNote.id; - diff.type_ = existingNote.type_; - await Note.save(diff, { autoTimestamp: false }); - result.noteUpdated = true; - } else { - await Note.save(note, { - isNew: true, - autoTimestamp: false, - }); - result.noteCreated = true; - } + await Note.save(note, { + isNew: true, + autoTimestamp: false, + }); return result; } @@ -345,12 +293,47 @@ const preProcessFile = async (filePath: string): Promise => { // return newFilePath; }; -export default async function importEnex(parentFolderId: string, filePath: string, importOptions: ImportOptions = null) { - if (!importOptions) importOptions = {}; - if (!('fuzzyMatching' in importOptions)) importOptions.fuzzyMatching = false; - if (!('onProgress' in importOptions)) importOptions.onProgress = function() {}; - if (!('onError' in importOptions)) importOptions.onError = function() {}; +const restoreNoteLinks = async (notes: SavedNote[], noteTitlesToIds: Record, importOptions: ImportOptions) => { + // -------------------------------------------------------- + // Convert the Evernote note links to Joplin note links. If + // we don't find a matching note, or if there are multiple + // matching notes, we leave the Evernote links as is. + // -------------------------------------------------------- + + for (const note of notes) { + const links = importOptions.outputFormat === 'html' ? + extractUrlsFromHtml(note.body) : + extractUrlsFromMarkdown(note.body); + + let noteChanged = false; + + for (const link of links) { + const matchingNoteIds = noteTitlesToIds[link.title]; + if (matchingNoteIds && matchingNoteIds.length === 1) { + note.body = note.body.replace(link.url, `:/${matchingNoteIds[0]}`); + noteChanged = true; + } + } + + if (noteChanged) { + await Note.save({ + id: note.id, + body: note.body, + updated_time: Date.now(), + }, { + autoTimestamp: false, + }); + } + } +}; + +interface ParseNotesResult { + savedNotes: SavedNote[]; + noteTitlesToIds: Record; +} + +const parseNotes = async (parentFolderId: string, filePath: string, importOptions: ImportOptions = null): Promise => { // eslint-disable-next-line @typescript-eslint/ban-types -- Old code before rule was applied function handleSaxStreamEvent(fn: Function) { return function(...args: any[]) { @@ -397,6 +380,9 @@ export default async function importEnex(parentFolderId: string, filePath: strin let noteResourceRecognition: NoteResourceRecognition = null; const notes: ExtractedNote[] = []; let processingNotes = false; + const savedNotes: SavedNote[] = []; + const createdNoteIds: string[] = []; + const noteTitlesToIds: Record = {}; const createErrorWithNoteTitle = (fnThis: any, error: any) => { const line = []; @@ -437,15 +423,6 @@ export default async function importEnex(parentFolderId: string, filePath: strin processingNotes = true; stream.pause(); - // Set the note ID so that we can create a title-to-id map, which - // will be needed to recreate the note links below. - const noteTitleToId: Record = {}; - for (const note of notes) { - if (!noteTitleToId[note.title]) noteTitleToId[note.title] = []; - note.id = uuid.create(); - noteTitleToId[note.title].push(note.id); - } - while (notes.length) { const note = notes.shift(); @@ -467,32 +444,16 @@ export default async function importEnex(parentFolderId: string, filePath: strin // Convert the ENEX body to either Markdown or HTML // -------------------------------------------------------- - let body: string = importOptions.outputFormat === 'html' ? + const body: string = importOptions.outputFormat === 'html' ? await enexXmlToHtml(note.bodyXml, note.resources) : await enexXmlToMd(note.bodyXml, note.resources, note.tasks); delete note.bodyXml; - // -------------------------------------------------------- - // Convert the Evernote note links to Joplin note links. If - // we don't find a matching note, or if there are multiple - // matching notes, we leave the Evernote links as is. - // -------------------------------------------------------- - - const links = importOptions.outputFormat === 'html' ? - extractUrlsFromHtml(body) : - extractUrlsFromMarkdown(body); - - for (const link of links) { - const matchingNoteIds = noteTitleToId[link.title]; - if (matchingNoteIds && matchingNoteIds.length === 1) { - body = body.replace(link.url, `:/${matchingNoteIds[0]}`); - } - } - // -------------------------------------------------------- // Finish setting up the note // -------------------------------------------------------- + note.id = uuid.create(); note.markup_language = importOptions.outputFormat === 'html' ? MarkupToHtml.MARKUP_LANGUAGE_HTML : MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN; @@ -511,15 +472,17 @@ export default async function importEnex(parentFolderId: string, filePath: strin // that case if (!note.updated_time) note.updated_time = note.created_time; - const result = await saveNoteToStorage(note, importOptions); + const result = await saveNoteToStorage(note); - if (result.noteUpdated) { - progressState.updated++; - } else if (result.noteCreated) { - progressState.created++; - } else if (result.noteSkipped) { - progressState.skipped++; - } + createdNoteIds.push(note.id); + if (!noteTitlesToIds[note.title]) noteTitlesToIds[note.title] = []; + noteTitlesToIds[note.title].push(note.id); + savedNotes.push({ + id: note.id, + body: note.body, + }); + + progressState.created++; progressState.resourcesCreated += result.resourcesCreated; progressState.notesTagged += result.notesTagged; importOptions.onProgress(progressState); @@ -648,7 +611,7 @@ export default async function importEnex(parentFolderId: string, filePath: strin notes.push(note); - if (notes.length >= 10) { + if (notes.length >= importOptions.batchSize) { // eslint-disable-next-line promise/prefer-await-to-then -- Old code before rule was applied processNotes().catch(error => { importOptions.onError(createErrorWithNoteTitle(this, error)); @@ -718,12 +681,25 @@ export default async function importEnex(parentFolderId: string, filePath: strin if (allDone) { shim.clearTimeout(iid); if (needToDeleteFileToProcess) void shim.fsDriver().remove(fileToProcess); - resolve(null); + resolve({ + savedNotes, + noteTitlesToIds, + }); } }); - }, 500); + }, 1000); })); stream.pipe(saxStream); }); +}; + +export default async function importEnex(parentFolderId: string, filePath: string, importOptions: ImportOptions = null) { + if (!importOptions) importOptions = {}; + if (!('onProgress' in importOptions)) importOptions.onProgress = function() {}; + if (!('onError' in importOptions)) importOptions.onError = function() {}; + if (!('batchSize' in importOptions)) importOptions.batchSize = 10; + + const result = await parseNotes(parentFolderId, filePath, importOptions); + await restoreNoteLinks(result.savedNotes, result.noteTitlesToIds, importOptions); } diff --git a/packages/lib/package.json b/packages/lib/package.json index 5cf070054..f3c17d24a 100644 --- a/packages/lib/package.json +++ b/packages/lib/package.json @@ -66,7 +66,6 @@ "image-type": "3.1.0", "immer": "7.0.15", "js-yaml": "4.1.0", - "levenshtein": "1.0.5", "markdown-it": "13.0.2", "md5": "2.3.0", "md5-file": "5.0.0", diff --git a/yarn.lock b/yarn.lock index 5fa2c5ed7..1df7dea4c 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6871,7 +6871,6 @@ __metadata: immer: 7.0.15 jest: 29.7.0 js-yaml: 4.1.0 - levenshtein: 1.0.5 markdown-it: 13.0.2 md5: 2.3.0 md5-file: 5.0.0 @@ -27688,13 +27687,6 @@ __metadata: languageName: node linkType: hard -"levenshtein@npm:1.0.5": - version: 1.0.5 - resolution: "levenshtein@npm:1.0.5" - checksum: d5ceca3bfc4804ad50515291841d968eea5f1f740310c21b5ae6cb6d5514ee68b9c00405059f36934611d3258967bad6d306dcf299f446c7cdd25bdda2c4720c - languageName: node - linkType: hard - "levn@npm:^0.4.1": version: 0.4.1 resolution: "levn@npm:0.4.1"