// Mirror of https://github.com/laurent22/joplin.git (synced 2024-12-12 08:54:00 +02:00)
// joplin/packages/lib/import-enex.ts - 654 lines, 20 KiB, TypeScript

import uuid from './uuid';
import BaseModel from './BaseModel';
import Note from './models/Note';
import Tag from './models/Tag';
import Resource from './models/Resource';
import Setting from './models/Setting';
import time from './time';
import shim from './shim';
import { NoteEntity } from './services/database/types';
import { enexXmlToMd } from './import-enex-md-gen';
import { MarkupToHtml } from '@joplin/renderer';
const moment = require('moment');
const { wrapError } = require('./errorUtils');
const { enexXmlToHtml } = require('./import-enex-html-gen.js');
const Levenshtein = require('levenshtein');
const md5 = require('md5');
const { Base64Decode } = require('base64-stream');
const md5File = require('md5-file');
const { mime } = require('./mime-utils');
// const Promise = require('promise');
const fs = require('fs-extra');
/**
 * Converts a date string found in an ENEX file to a Unix timestamp in
 * milliseconds.
 *
 * @param s Date string to parse.
 * @param defaultValue Value returned when `s` cannot be parsed. When left to
 * `null`, an unparsable date raises an exception instead.
 * @returns The timestamp in milliseconds, or `defaultValue` if the date is
 * invalid and a default was provided.
 * @throws Error when the date is invalid and `defaultValue` is `null`.
 */
function dateToTimestamp(s: string, defaultValue: number = null): number {
	// Most dates seem to be in this format
	let m = moment(s, 'YYYYMMDDTHHmmssZ');

	// But sometimes they might be in this format eg. 20180306T91108 AMZ
	// https://github.com/laurent22/joplin/issues/557
	if (!m.isValid()) m = moment(s, 'YYYYMMDDThmmss AZ');

	if (!m.isValid()) {
		if (defaultValue !== null) return defaultValue;
		throw new Error(`Invalid date: ${s}`);
	}

	return m.toDate().getTime();
}
/**
 * Extracts the value of the `objID="..."` attribute from a recognition XML
 * fragment.
 *
 * @param recognitionXml Raw XML text of the recognition block.
 * @returns The attribute value, or `null` when the input is empty or contains
 * no `objID` attribute.
 */
function extractRecognitionObjId(recognitionXml: string) {
	// The value comes straight from the XML parser, so be tolerant of
	// missing or non-string input rather than throwing.
	if (!recognitionXml || typeof recognitionXml !== 'string') return null;

	const r = recognitionXml.match(/objID="(.*?)"/);
	return r && r.length >= 2 ? r[1] : null;
}
/**
 * Decodes a Base64-encoded file into a binary file, streaming the content so
 * that the whole file is never held in memory.
 *
 * @param sourceFilePath Path of the Base64-encoded input file.
 * @param destFilePath Path of the decoded file to create (overwritten if it
 * already exists).
 * @returns A promise resolved with `null` once the decoded data has been
 * flushed to disk and the file closed. It is rejected with the first error
 * reported by any of the streams, or by the final flush/close.
 */
async function decodeBase64File(sourceFilePath: string, destFilePath: string) {
	// When something goes wrong with streams you can get an error "EBADF, Bad file descriptor"
	// with no stack trace to tell where the error happened.

	// There is a source, a decoder and a destination stream, and more than
	// one of them may report a problem, while a promise can only be settled
	// once. The `settled` flag below makes sure that only the first outcome
	// (success or error) is taken into account, and that the file descriptor
	// is released in every case.
	return new Promise(function(resolve, reject) {
		// Note: we manually handle closing the file so that we can
		// force flushing it before close. This is needed because
		// "end" might be called before the file has been flushed
		// to disk, thus resulting in the calling code to find a
		// file with size 0.
		const destFile = fs.openSync(destFilePath, 'w');
		const sourceStream = fs.createReadStream(sourceFilePath);
		const decodeStream = new Base64Decode();
		// When "fd" is provided, the path argument is ignored by the stream.
		const destStream = fs.createWriteStream(destFilePath, {
			fd: destFile,
			autoClose: false,
		});

		let settled = false;

		const closeDestFileQuietly = () => {
			try {
				fs.closeSync(destFile);
			} catch (error) {
				// Nothing more can be done if closing fails - the original
				// error is the one being reported.
			}
		};

		const onStreamError = (error: any) => {
			if (settled) return;
			settled = true;

			// Stop all activity, then release the descriptor since
			// autoClose is disabled and nothing else would close it.
			sourceStream.destroy();
			destStream.destroy();
			closeDestFileQuietly();
			reject(error);
		};

		// We wait for the destination stream "finish" event, not the source stream "end" event
		// because even if the source has finished sending data, the destination might not have
		// finished receiving it and writing it to disk.
		destStream.on('finish', () => {
			if (settled) return;
			settled = true;

			try {
				fs.fdatasyncSync(destFile);
				fs.closeSync(destFile);
			} catch (error) {
				closeDestFileQuietly();
				reject(error);
				return;
			}

			resolve(null);
		});

		sourceStream.on('error', onStreamError);
		decodeStream.on('error', onStreamError);
		destStream.on('error', onStreamError);

		sourceStream.pipe(decodeStream).pipe(destStream);
	});
}
/**
 * Promise wrapper around the callback-based `md5File()` function.
 *
 * @param filePath Path of the file to hash.
 * @returns A promise resolved with the hash reported by `md5File`, or
 * rejected with the error it reports.
 */
async function md5FileAsync(filePath: string): Promise<string> {
	return new Promise((resolve, reject) => {
		md5File(filePath, (error: any, hash: string) => {
			if (error) {
				reject(error);
				return;
			}

			resolve(hash);
		});
	});
}
/**
 * Returns a shallow copy of `note` without the properties whose value is
 * `undefined` or `null`. Other falsy values (0, '', false) are preserved.
 *
 * @param note Object to copy. It is not modified.
 * @returns A new plain object containing only the defined own properties.
 */
function removeUndefinedProperties(note: NoteEntity) {
	const output: any = {};

	// Object.entries() only yields own enumerable properties, and unlike
	// calling note.hasOwnProperty() it also works when the object has no
	// prototype or defines its own "hasOwnProperty" property.
	for (const [key, value] of Object.entries(note || {})) {
		if (value === undefined || value === null) continue;
		output[key] = value;
	}

	return output;
}
/**
 * Computes the Levenshtein distance between two strings, relative to the
 * length of the first one.
 *
 * @param s1 Reference string. Its length is used as the divisor.
 * @param s2 String to compare with `s1`.
 * @returns `distance / s1.length`, where 0 means identical strings. Returns 1
 * when either string is empty or missing.
 */
function levenshteinPercent(s1: string, s2: string) {
	// Check the inputs first so that no distance is computed for nothing,
	// and so that a missing title does not raise an exception.
	if (!s1 || !s2) return 1;

	const l = new Levenshtein(s1, s2);
	return Math.abs(l.distance / s1.length);
}
/**
 * Looks for a note already in the database that is likely to be the same as
 * the note being imported.
 *
 * - If the imported note was created more than 360 days ago, a note matches
 *   only when it has the exact same creation time and title, and when there
 *   is exactly one such note.
 * - Otherwise candidates are the notes with the same creation time. A single
 *   candidate is returned as is. Among several, the one with the closest
 *   title is returned provided its relative Levenshtein distance is below
 *   0.2.
 *
 * @param note Note being imported.
 * @returns The matching note, or `null` if none was found.
 */
async function fuzzyMatch(note: ExtractedNote) {
	if (note.created_time < time.unixMs() - 1000 * 60 * 60 * 24 * 360) {
		const notes = await Note.modelSelectAll('SELECT * FROM notes WHERE is_conflict = 0 AND created_time = ? AND title = ?', [note.created_time, note.title]);
		return notes.length !== 1 ? null : notes[0];
	}

	const notes = await Note.modelSelectAll('SELECT * FROM notes WHERE is_conflict = 0 AND created_time = ?', [note.created_time]);
	if (notes.length === 0) return null;
	if (notes.length === 1) return notes[0];

	let lowestL = 1;
	let lowestN = null;
	for (const n of notes) {
		const l = levenshteinPercent(note.title, n.title);
		if (l < lowestL) {
			lowestL = l;
			lowestN = n;
		}
	}

	if (lowestN && lowestL < 0.2) return lowestN;

	return null;
}
/**
 * Resource (attachment) as extracted from a `<resource>` element of the ENEX
 * file. All the properties are optional because the object is built
 * progressively while the XML is being parsed.
 */
interface ExtractedResource {
	// Identity and description

	/** Resource ID: the recognition `objID` when available, otherwise a hash set during processing. */
	id?: string;
	/** Display title - the import sets it to the trimmed file name. */
	title?: string;
	/** File name, taken from the `file-name` resource attribute. */
	filename?: string;
	/** MIME type of the resource. */
	mime?: string;
	/** Value of the `source-url` resource attribute, when present. */
	sourceUrl?: string;

	// Content

	/** True once text has been found inside the `<data>` element. */
	hasData?: boolean;
	/** Encoding declared by the `encoding` attribute of `<data>` (only "base64" can be decoded). */
	dataEncoding?: string;
	/** Path of the temporary file that holds the resource content. */
	dataFilePath?: string;
	/** Inline data - only read when building a debug message (truncated). */
	data?: string;
	/** Size of the content file in bytes. */
	size?: number;
}
/**
 * Note as extracted from a `<note>` element of the ENEX file: a regular note
 * entity, plus the data that only exists while the import is in progress.
 */
interface ExtractedNote extends NoteEntity {
	/** Title of the note, from the `<title>` element. */
	title?: string;
	/** Raw content of the CDATA block found in `<content>`. */
	bodyXml?: string;
	/** Titles of the tags attached to the note. */
	tags?: string[];
	/** Resources attached to the note. */
	resources?: ExtractedResource[];
	// is_todo?: boolean;
}
/**
 * Finalises a resource that has just been parsed from the XML: decodes its
 * data file if needed, then fills in its size and, when missing, its ID.
 *
 * @param resource Resource to process. It is modified in place.
 * @returns The same resource object.
 * @throws Error if the data encoding is not supported, or if the resource
 * has data but ends up with no ID or no content.
 */
async function processNoteResource(resource: ExtractedResource) {
	// Some resources have no data at all. Such a resource gets a generated
	// ID and an empty file, and there is nothing else to do.
	if (!resource.hasData) {
		resource.id = md5(Date.now() + Math.random());
		resource.size = 0;
		resource.dataFilePath = `${Setting.value('tempDir')}/${resource.id}.empty`;
		await fs.writeFile(resource.dataFilePath, '');
		return resource;
	}

	const encoding = resource.dataEncoding;
	if (encoding == 'base64') {
		const decodedPath = `${resource.dataFilePath}.decoded`;
		await decodeBase64File(resource.dataFilePath, decodedPath);
		resource.dataFilePath = decodedPath;
	} else if (encoding) {
		throw new Error(`Cannot decode resource with encoding: ${resource.dataEncoding}`);
	}

	resource.size = fs.statSync(resource.dataFilePath).size;

	// Without an explicit ID, the resource ID is the MD5 of the data, which
	// matches the "hash" attribute of the corresponding <en-media> tag.
	if (!resource.id) resource.id = await md5FileAsync(resource.dataFilePath);

	const isValid = !!resource.id && !!resource.size;
	if (!isValid) {
		// Truncate the data so that the error message stays readable
		const debugInfo = { ...resource };
		debugInfo.data = debugInfo.data ? `${debugInfo.data.substr(0, 32)}...` : debugInfo.data;
		throw new Error(`This resource was not added because it has no ID or no content: ${JSON.stringify(debugInfo)}`);
	}

	return resource;
}
/**
 * Saves the resources of a note: moves each data file to the path given by
 * `Resource.fullPath()` and creates the corresponding resource item.
 *
 * @param note Note whose `resources` are to be saved.
 * @returns The number of resources that were created. Resources that already
 * exist are not counted.
 */
async function saveNoteResources(note: ExtractedNote) {
	let resourcesCreated = 0;

	for (const resource of note.resources) {
		// These properties are only needed during the import and must not
		// be saved with the resource.
		const toSave = Object.assign({}, resource);
		delete toSave.dataFilePath;
		delete toSave.dataEncoding;
		delete toSave.hasData;

		// The same resource sometimes appear twice in the same enex (exact same ID and file).
		// In that case, just skip it - it means two different notes might be linked to the
		// same resource.
		const existingResource = await Resource.load(toSave.id);
		if (existingResource) continue;

		await fs.move(resource.dataFilePath, Resource.fullPath(toSave), { overwrite: true });
		await Resource.save(toSave, { isNew: true });
		resourcesCreated++;
	}

	return resourcesCreated;
}
/**
 * Associates the note with each of its tags, creating the tags that do not
 * exist yet.
 *
 * @param note Note whose `tags` (a list of tag titles) are to be applied.
 * The tags are linked to `note.id`.
 * @returns The number of tags that were applied to the note.
 */
async function saveNoteTags(note: ExtractedNote) {
	let notesTagged = 0;

	for (const tagTitle of note.tags) {
		let tag = await Tag.loadByTitle(tagTitle);
		if (!tag) tag = await Tag.save({ title: tagTitle });

		await Tag.addNote(tag.id, note.id);

		notesTagged++;
	}

	return notesTagged;
}
/** Options that control how an ENEX file is imported. */
interface ImportOptions {
	/** Format of the imported note bodies: 'html' produces HTML notes, anything else Markdown. */
	outputFormat?: string;
	/** When true, try to match each imported note with an existing note instead of always creating one. */
	fuzzyMatching?: boolean;
	/** Called with the progress state each time it changes. */
	onProgress?: Function;
	/** Called with the error whenever something goes wrong during the import. */
	onError?: Function;
}
/**
 * Saves an imported note, along with its resources and tags.
 *
 * When fuzzy matching is enabled and an existing note matches, only the
 * properties that differ are saved to that existing note, or nothing at all
 * if there is no difference. Otherwise a new note is created.
 *
 * @param note Note to save.
 * @param importOptions Import options - only `fuzzyMatching` is used here,
 * and it defaults to `false`.
 * @returns An object telling whether the note was created, updated or
 * skipped, and how many resources were created and tags applied.
 */
async function saveNoteToStorage(note: ExtractedNote, importOptions: ImportOptions) {
	importOptions = Object.assign({}, {
		fuzzyMatching: false,
	}, importOptions);

	note = Note.filter(note as any);

	const existingNote = importOptions.fuzzyMatching ? await fuzzyMatch(note) : null;

	const result = {
		noteCreated: false,
		noteUpdated: false,
		noteSkipped: false,
		resourcesCreated: 0,
		notesTagged: 0,
	};

	const resourcesCreated = await saveNoteResources(note);
	result.resourcesCreated += resourcesCreated;

	// NOTE(review): the tags are linked to note.id even when an existing
	// note has been matched, although in that case the data is saved under
	// existingNote.id - confirm that this is intended.
	const notesTagged = await saveNoteTags(note);
	result.notesTagged += notesTagged;

	if (existingNote) {
		const diff = BaseModel.diffObjects(existingNote, note);
		delete diff.tags;
		delete diff.resources;
		delete diff.id;

		// Nothing has changed compared to the existing note
		if (!Object.getOwnPropertyNames(diff).length) {
			result.noteSkipped = true;
			return result;
		}

		diff.id = existingNote.id;
		diff.type_ = existingNote.type_;
		await Note.save(diff, { autoTimestamp: false });
		result.noteUpdated = true;
	} else {
		await Note.save(note, {
			isNew: true,
			autoTimestamp: false,
		});
		result.noteCreated = true;
	}

	return result;
}
// XML element as received by the "opentag" handler of the SAX stream.
interface Node {
// Name of the element
name: string;
// Attributes of the element, indexed by attribute name
attributes: Record<string, any>;
}
// Data extracted from the <recognition> element of a resource.
interface NoteResourceRecognition {
// Value of the objID attribute found in the recognition CDATA block. It is
// used as the resource ID.
objID?: string;
}
/**
 * Works around invalid XML sometimes found in ENEX files, before the file is
 * handed to the parser.
 *
 * @param filePath Path of the ENEX file to check.
 * @returns `filePath` itself when the file needs no fix. Otherwise the path
 * of a new file, created in the temp directory, that contains the fixed
 * content.
 */
const preProcessFile = async (filePath: string): Promise<string> => {
	const originalXml: string = await shim.fsDriver().readFile(filePath, 'utf8');

	// The content of a note is wrapped in a CDATA block, which means that
	// any "]]>" found inside the note has to be escaped somehow, or the
	// CDATA block would end too early.
	//
	// It appears that Evernote encodes "]]>" as "]]<![CDATA[>]]>" rather
	// than the more sensible "]]&gt;" - or perhaps nothing properly escapes
	// the data coming from their web clipper. Either way the result is
	// invalid XML, which Evernote itself cannot import back.
	//
	// Dealing with this in the SAX handlers would be very tricky, so these
	// sequences are replaced with a plain "&gt;" ahead of parsing. This
	// should be safe since such content can only be found in the body of a
	// note, where ">" and "&gt;" are equivalent.
	//
	// Ref: https://discourse.joplinapp.org/t/20470/4
	const fixedXml = originalXml.replace(/<!\[CDATA\[>\]\]>/g, '&gt;');

	const nothingToFix = fixedXml === originalXml;
	if (nothingToFix) return filePath;

	const tempDir = Setting.value('tempDir');
	const fixedFilePath = `${tempDir}/${md5(Date.now() + Math.random())}.enex`;
	await shim.fsDriver().writeFile(fixedFilePath, fixedXml, 'utf8');
	return fixedFilePath;
};
/**
 * Imports the notes, resources and tags of an Evernote export file (ENEX).
 *
 * The file is parsed as a stream. Parsed notes are queued and saved in
 * batches, the input stream being paused while a batch is being saved.
 *
 * @param parentFolderId ID of the folder the imported notes are added to.
 * @param filePath Path of the ENEX file.
 * @param importOptions Import options. Missing options are given a default
 * value directly on the provided object.
 * @returns A promise resolved with `null` once the end of the file has been
 * reached and all the queued notes have been processed. Errors are reported
 * through `importOptions.onError` rather than by rejecting the promise.
 */
export default async function importEnex(parentFolderId: string, filePath: string, importOptions: ImportOptions = null) {
	if (!importOptions) importOptions = {};
	if (!('fuzzyMatching' in importOptions)) importOptions.fuzzyMatching = false;
	if (!('onProgress' in importOptions)) importOptions.onProgress = function() {};
	if (!('onError' in importOptions)) importOptions.onError = function() {};

	// Wraps a SAX event handler so that any exception it throws is reported
	// through onError instead of propagating into the parser.
	function handleSaxStreamEvent(fn: Function) {
		return function(...args: any[]) {
			// Pass the parser to the wrapped function for debugging purposes
			if (this._parser) (fn as any)._parser = this._parser;

			try {
				fn.call(this, ...args);
			} catch (error) {
				if (importOptions.onError) {
					importOptions.onError(error);
				} else {
					console.error(error);
				}
			}
		};
	}

	const fileToProcess = await preProcessFile(filePath);
	// Pre-processing may have created a temporary copy of the file, which
	// has to be deleted once the import is done.
	const needToDeleteFileToProcess = fileToProcess !== filePath;

	return new Promise((resolve) => {
		const progressState = {
			loaded: 0,
			created: 0,
			updated: 0,
			skipped: 0,
			resourcesCreated: 0,
			notesTagged: 0,
		};

		const stream = fs.createReadStream(fileToProcess);

		const options = {};
		const strict = true;
		const saxStream = require('@joplin/fork-sax').createStream(strict, options);

		const nodes: Node[] = []; // LIFO list of nodes so that we know in which node we are in the onText event
		let note: ExtractedNote = null;
		let noteAttributes: Record<string, any> = null;
		let noteResource: ExtractedResource = null;
		let noteResourceAttributes: Record<string, any> = null;
		let noteResourceRecognition: NoteResourceRecognition = null;
		const notes: ExtractedNote[] = [];
		let processingNotes = false;

		// Prefixes the error message with the parser position and the title
		// of the current note, when they are known.
		const createErrorWithNoteTitle = (fnThis: any, error: any) => {
			const line = [];

			const parser = fnThis ? fnThis._parser : null;
			if (parser) {
				line.push(`Line ${parser.line}:${parser.column}`);
			}

			if (note && note.title) {
				line.push(`"${note.title}"`);
			}

			line.push(error.message);

			error.message = line.join(': ');

			return error;
		};

		stream.on('error', function(error: any) {
			importOptions.onError(createErrorWithNoteTitle(this, error));
		});

		function currentNodeName() {
			if (!nodes.length) return null;
			return nodes[nodes.length - 1].name;
		}

		function currentNodeAttributes() {
			if (!nodes.length) return {};
			return nodes[nodes.length - 1].attributes;
		}

		// Saves all the notes currently in the queue. Returns false without
		// doing anything if a call is already in progress, and true once
		// the queue has been emptied.
		async function processNotes() {
			if (processingNotes) return false;

			processingNotes = true;
			stream.pause();

			while (notes.length) {
				const note = notes.shift();

				try {
					for (let i = 0; i < note.resources.length; i++) {
						let resource = note.resources[i];

						try {
							resource = await processNoteResource(resource);
						} catch (error) {
							// A resource that cannot be processed is
							// reported, but does not prevent the note
							// from being imported.
							importOptions.onError(createErrorWithNoteTitle(null, error));
							continue;
						}

						note.resources[i] = resource;
					}

					const body = importOptions.outputFormat === 'html' ?
						await enexXmlToHtml(note.bodyXml, note.resources) :
						await enexXmlToMd(note.bodyXml, note.resources);
					delete note.bodyXml;

					note.markup_language = importOptions.outputFormat === 'html' ?
						MarkupToHtml.MARKUP_LANGUAGE_HTML :
						MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN;

					// console.info('*************************************************************************');
					// console.info(body);
					// console.info('*************************************************************************');

					note.id = uuid.create();
					note.parent_id = parentFolderId;
					note.body = body;

					// If the created timestamp was invalid, it would be
					// set to zero, so set it to the current date here
					if (!note.created_time) note.created_time = Date.now();

					// Notes in enex files always have a created timestamp
					// but not always an updated timestamp (if the note has
					// never been modified). For sync we require an
					// updated_time property, so set it to create_time in
					// that case
					if (!note.updated_time) note.updated_time = note.created_time;

					const result = await saveNoteToStorage(note, importOptions);

					if (result.noteUpdated) {
						progressState.updated++;
					} else if (result.noteCreated) {
						progressState.created++;
					} else if (result.noteSkipped) {
						progressState.skipped++;
					}

					progressState.resourcesCreated += result.resourcesCreated;
					progressState.notesTagged += result.notesTagged;
					importOptions.onProgress(progressState);
				} catch (error) {
					const newError = wrapError(`Error on note "${note.title}"`, error);
					importOptions.onError(createErrorWithNoteTitle(null, newError));
				}
			}

			stream.resume();
			processingNotes = false;
			return true;
		}

		saxStream.on('error', function(error: any) {
			importOptions.onError(createErrorWithNoteTitle(this, error));
		});

		saxStream.on('text', handleSaxStreamEvent(function(text: string) {
			const n = currentNodeName();

			// The text goes to the innermost object being built
			if (noteAttributes) {
				noteAttributes[n] = text;
			} else if (noteResourceAttributes) {
				noteResourceAttributes[n] = text;
			} else if (noteResource) {
				if (n === 'data') {
					if (!noteResource.dataEncoding) {
						const attr = currentNodeAttributes();
						noteResource.dataEncoding = attr.encoding;
					}

					if (!noteResource.dataFilePath) {
						noteResource.dataFilePath = `${Setting.value('tempDir')}/${md5(Date.now() + Math.random())}.base64`;
					}

					noteResource.hasData = true;

					// The data is appended to a temporary file as it is
					// received instead of being accumulated in memory.
					fs.appendFileSync(noteResource.dataFilePath, text);
				} else {
					if (!(n in noteResource)) (noteResource as any)[n] = '';
					(noteResource as any)[n] += text;
				}
			} else if (note) {
				if (n === 'title') {
					note.title = text;
				} else if (n === 'created') {
					note.created_time = dateToTimestamp(text, 0);
				} else if (n === 'updated') {
					note.updated_time = dateToTimestamp(text, 0);
				} else if (n === 'tag') {
					note.tags.push(text);
				} else if (n === 'note') {
					// Ignore - white space between the opening tag <note> and the first sub-tag
				} else if (n === 'content') {
					// Ignore - white space between the opening tag <content> and the <![CDATA[< block where the content actually is
				} else {
					console.warn(createErrorWithNoteTitle(this, new Error(`Unsupported note tag: ${n}`)));
				}
			}
		}));

		saxStream.on('opentag', handleSaxStreamEvent(function(node: Node) {
			const n = node.name.toLowerCase();
			nodes.push(node);

			if (n === 'note') {
				note = {
					resources: [],
					tags: [],
					bodyXml: '',
				};
			} else if (n === 'resource-attributes') {
				noteResourceAttributes = {};
			} else if (n === 'recognition') {
				if (noteResource) noteResourceRecognition = {};
			} else if (n === 'note-attributes') {
				noteAttributes = {};
			} else if (n === 'resource') {
				noteResource = {
					hasData: false,
				};
			}
		}));

		saxStream.on('cdata', handleSaxStreamEvent(function(data: any) {
			const n = currentNodeName();

			if (noteResourceRecognition) {
				noteResourceRecognition.objID = extractRecognitionObjId(data);
			} else if (note) {
				if (n === 'content') {
					note.bodyXml += data;
				}
			}
		}));

		saxStream.on('closetag', handleSaxStreamEvent(function(n: string) {
			nodes.pop();

			if (n === 'note') {
				note = removeUndefinedProperties(note);

				progressState.loaded++;
				importOptions.onProgress(progressState);

				notes.push(note);

				// Save the notes in batches
				if (notes.length >= 10) {
					processNotes().catch(error => {
						importOptions.onError(createErrorWithNoteTitle(this, error));
					});
				}
				note = null;
			} else if (n === 'recognition' && noteResource) {
				noteResource.id = noteResourceRecognition.objID;
				noteResourceRecognition = null;
			} else if (n === 'resource-attributes') {
				noteResource.filename = noteResourceAttributes['file-name'];
				if (noteResourceAttributes['source-url']) noteResource.sourceUrl = noteResourceAttributes['source-url'];
				noteResourceAttributes = null;
			} else if (n === 'note-attributes') {
				note.latitude = noteAttributes.latitude;
				note.longitude = noteAttributes.longitude;
				note.altitude = noteAttributes.altitude;
				note.author = noteAttributes.author ? noteAttributes.author.trim() : '';
				note.is_todo = noteAttributes['reminder-order'] !== '0' && !!noteAttributes['reminder-order'] as any;
				note.todo_due = dateToTimestamp(noteAttributes['reminder-time'], 0);
				note.todo_completed = dateToTimestamp(noteAttributes['reminder-done-time'], 0);
				note.order = dateToTimestamp(noteAttributes['reminder-order'], 0);
				note.source = noteAttributes.source ? `evernote.${noteAttributes.source.trim()}` : 'evernote';
				note.source_url = noteAttributes['source-url'] ? noteAttributes['source-url'].trim() : '';

				noteAttributes = null;
			} else if (n === 'resource') {
				let mimeType = noteResource.mime ? noteResource.mime.trim() : '';

				// Evernote sometimes gives an invalid or generic
				// "application/octet-stream" mime type for files that could
				// have a valid mime type, based on the extension. So in
				// general, we trust the filename more than the provided mime
				// type.
				// https://discourse.joplinapp.org/t/importing-a-note-with-a-zip-file/12123
				if (noteResource.filename) {
					const mimeTypeFromFile = mime.fromFilename(noteResource.filename);
					if (mimeTypeFromFile && mimeTypeFromFile !== mimeType) {
						// Don't print statement by default because it would show up in test units
						// console.info(`Invalid mime type "${mimeType}" for resource "${noteResource.filename}". Using "${mimeTypeFromFile}" instead.`);
						mimeType = mimeTypeFromFile;
					}
				}

				note.resources.push({
					id: noteResource.id,
					dataFilePath: noteResource.dataFilePath,
					dataEncoding: noteResource.dataEncoding,
					mime: mimeType,
					title: noteResource.filename ? noteResource.filename.trim() : '',
					filename: noteResource.filename ? noteResource.filename.trim() : '',
					hasData: noteResource.hasData,
				});

				noteResource = null;
			}
		}));

		saxStream.on('end', handleSaxStreamEvent(function() {
			// Wait till there is no more notes to process.
			const iid = shim.setInterval(() => {
				void processNotes().then(allDone => {
					if (allDone) {
						// The timer was created with setInterval() so it
						// has to be cancelled with clearInterval().
						shim.clearInterval(iid);
						if (needToDeleteFileToProcess) void shim.fsDriver().remove(fileToProcess);
						resolve(null);
					}
				});
			}, 500);
		}));

		stream.pipe(saxStream);
	});
}