import Setting from '../../../models/Setting';
import shim from '../../../shim';
import uuid from '../../../uuid';
import readonlyProperties from '../utils/readonlyProperties';
import defaultSaveOptions from '../utils/defaultSaveOptions';
import defaultAction from '../utils/defaultAction';
import BaseModel, { ModelType } from '../../../BaseModel';
import defaultLoadOptions from '../utils/defaultLoadOptions';
import { RequestMethod, Request } from '../Api';
import markdownUtils from '../../../markdownUtils';
import collectionToPaginatedResults from '../utils/collectionToPaginatedResults';
import Database from '../../../database';
import Folder from '../../../models/Folder';
import Note from '../../../models/Note';
import Tag from '../../../models/Tag';
import Resource from '../../../models/Resource';
import htmlUtils from '../../../htmlUtils';
import markupLanguageUtils from '../../../markupLanguageUtils';
const mimeUtils = require('../../../mime-utils.js').mime;
const md5 = require('md5');
import HtmlToMd from '../../../HtmlToMd';
const urlUtils = require('../../../urlUtils.js');
import * as ArrayUtils from '../../../ArrayUtils';
import Logger from '@joplin/utils/Logger';
const { mimeTypeFromHeaders } = require('../../../net-utils');
const { fileExtension, safeFileExtension, safeFilename, filename } = require('../../../path-utils');
const { MarkupToHtml } = require('@joplin/renderer');
const { ErrorNotFound } = require('../utils/errors');
import { fileUriToPath } from '@joplin/utils/url';
import { NoteEntity } from '../../database/types';

const logger = Logger.create('routes/notes');

let htmlToMdParser_: any = null;

function htmlToMdParser() {
	if (htmlToMdParser_) return htmlToMdParser_;
	htmlToMdParser_ = new HtmlToMd();
	return htmlToMdParser_;
}

type RequestNote = {
	id?: any;
	parent_id?: string;
	title: string;
	body?: string;
	latitude?: number;
	longitude?: number;
	altitude?: number;
	author?: string;
	source_url?: string;
	is_todo?: number;
	todo_due?: number;
	todo_completed?: number;
	user_updated_time?: number;
	user_created_time?: number;
	markup_language?: number;
	body_html: string;
	base_url?: string;
	convert_to: string;
	anchor_names?: any[];
	image_sizes?: object;
	stylesheets: any;
};
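
// A minimal RequestNote payload, shown for illustration only (not executed).
// The field values are assumptions; only the required fields of the type above
// plus base_url are filled in.
//
//   const exampleRequest: RequestNote = {
//     title: 'Clipped page',
//     body_html: '<p>Hello <b>world</b></p>',
//     convert_to: 'markdown',
//     stylesheets: [],
//     base_url: 'https://example.com/',
//   };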

type FetchOptions = {
	timeout?: number;
	maxRedirects?: number;
};

async function requestNoteToNote(requestNote: RequestNote): Promise<NoteEntity> {
	const output: any = {
		title: requestNote.title ? requestNote.title : '',
		body: requestNote.body ? requestNote.body : '',
	};

	if (requestNote.id) output.id = requestNote.id;

	const baseUrl = requestNote.base_url ? requestNote.base_url : '';

	if (requestNote.body_html) {
		if (requestNote.convert_to === 'html') {
			const style = await buildNoteStyleSheet(requestNote.stylesheets);
			const minify = require('html-minifier').minify;

			const minifyOptions = {
				// Remove all spaces and, especially, newlines from tag attributes, as that would
				// break the rendering.
				customAttrCollapse: /.*/,
				// Need to remove all whitespace because whitespace at the beginning of a line
				// means a code block in Markdown.
				collapseWhitespace: true,
				minifyCSS: true,
				maxLineLength: 300,
			};

			const uglifycss = require('uglifycss');
			const styleString = uglifycss.processString(style.join('\n'), {
				// Need to set a max length because Ace Editor takes forever
				// to display notes with long lines.
				maxLineLen: 200,
			});

			const styleTag = style.length ? `<style>${styleString}</style>` + '\n' : '';
			let minifiedHtml = '';
			try {
				minifiedHtml = minify(requestNote.body_html, minifyOptions);
			} catch (error) {
				console.warn('Could not minify HTML - using non-minified HTML instead', error);
				minifiedHtml = requestNote.body_html;
			}
			output.body = styleTag + minifiedHtml;
			output.body = htmlUtils.prependBaseUrl(output.body, baseUrl);
			output.markup_language = MarkupToHtml.MARKUP_LANGUAGE_HTML;
		} else {
			// Convert to Markdown
			// Parsing will not work if the HTML is not wrapped in a top-level tag, which is not guaranteed
			// when getting the content from elsewhere. So wrap it here - it won't change the final
			// rendering, but it makes sure everything gets parsed.
			output.body = await htmlToMdParser().parse(`<div>${requestNote.body_html}</div>`, {
				baseUrl: baseUrl,
				anchorNames: requestNote.anchor_names ? requestNote.anchor_names : [],
				convertEmbeddedPdfsToLinks: true,
			});
			output.markup_language = MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN;
		}
	}

	if (requestNote.parent_id) {
		output.parent_id = requestNote.parent_id;
	} else {
		const folder = await Folder.defaultFolder();
		if (!folder) throw new Error('Cannot find folder for note');
		output.parent_id = folder.id;
	}

	if ('source_url' in requestNote) output.source_url = requestNote.source_url;
	if ('author' in requestNote) output.author = requestNote.author;
	if ('user_updated_time' in requestNote) output.user_updated_time = Database.formatValue(Database.TYPE_INT, requestNote.user_updated_time);
	if ('user_created_time' in requestNote) output.user_created_time = Database.formatValue(Database.TYPE_INT, requestNote.user_created_time);
	if ('is_todo' in requestNote) output.is_todo = Database.formatValue(Database.TYPE_INT, requestNote.is_todo);
	if ('todo_due' in requestNote) output.todo_due = Database.formatValue(Database.TYPE_INT, requestNote.todo_due);
	if ('todo_completed' in requestNote) output.todo_completed = Database.formatValue(Database.TYPE_INT, requestNote.todo_completed);
	if ('markup_language' in requestNote) output.markup_language = Database.formatValue(Database.TYPE_INT, requestNote.markup_language);
	if ('longitude' in requestNote) output.longitude = requestNote.longitude;
	if ('latitude' in requestNote) output.latitude = requestNote.latitude;
	if ('altitude' in requestNote) output.altitude = requestNote.altitude;

	if (!output.markup_language) output.markup_language = MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN;

	return output;
}
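
// Illustrative sketch (not executed) of how a body_html payload maps to a note
// entity; the input values are assumptions.
//
//   const note = await requestNoteToNote({
//     title: 'Example', body_html: '<p>Hi</p>', convert_to: '', stylesheets: [],
//   });
//   // note.body is now Markdown and note.markup_language is
//   // MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN, since convert_to is not 'html'.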

async function buildNoteStyleSheet(stylesheets: any[]) {
	if (!stylesheets) return [];

	const output = [];

	for (const stylesheet of stylesheets) {
		if (stylesheet.type === 'text') {
			output.push(stylesheet.value);
		} else if (stylesheet.type === 'url') {
			try {
				const tempPath = `${Setting.value('tempDir')}/${md5(`${Math.random()}_${Date.now()}`)}.css`;
				await shim.fetchBlob(stylesheet.value, { path: tempPath, maxRetry: 1 });
				const text = await shim.fsDriver().readFile(tempPath);
				output.push(text);
				await shim.fsDriver().remove(tempPath);
			} catch (error) {
				logger.warn(`Cannot download stylesheet at ${stylesheet.value}`, error);
			}
		} else {
			throw new Error(`Invalid stylesheet type: ${stylesheet.type}`);
		}
	}

	return output;
}
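
// The expected shape of the "stylesheets" input, inferred from the loop above
// (values are illustrative):
//
//   const styles = await buildNoteStyleSheet([
//     { type: 'text', value: 'p { color: red; }' },
//     { type: 'url', value: 'https://example.com/style.css' },
//   ]);
//   // styles is an array of CSS strings; URL entries that fail to download are skipped.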

async function tryToGuessExtFromMimeType(response: any, mediaPath: string) {
	const mimeType = mimeTypeFromHeaders(response.headers);
	if (!mimeType) return mediaPath;

	const newExt = mimeUtils.toFileExtension(mimeType);
	if (!newExt) return mediaPath;

	const newMediaPath = `${mediaPath}.${newExt}`;
	await shim.fsDriver().move(mediaPath, newMediaPath);
	return newMediaPath;
}


const getFileExtension = (url: string, isDataUrl: boolean) => {
	let fileExt = isDataUrl ? mimeUtils.toFileExtension(mimeUtils.fromDataUrl(url)) : safeFileExtension(fileExtension(url).toLowerCase());
	if (!mimeUtils.fromFileExtension(fileExt)) fileExt = ''; // If the file extension is unknown - clear it.
	if (fileExt) fileExt = `.${fileExt}`;

	return fileExt;
};
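
// Example results, assuming the path and MIME helpers behave as used above
// (illustrative only):
//
//   getFileExtension('https://example.com/photo.JPG', false); // '.jpg'
//   getFileExtension('https://example.com/page', false);      // '' (no known extension)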

const generateMediaPath = (url: string, isDataUrl: boolean, fileExt: string) => {
	const tempDir = Setting.value('tempDir');
	const name = isDataUrl ? md5(`${Math.random()}_${Date.now()}`) : filename(url);
	// Append a UUID because simply checking if the file exists is not enough since
	// multiple resources can be downloaded at the same time (race condition).
	const mediaPath = `${tempDir}/${safeFilename(name)}_${uuid.create()}${fileExt}`;
	return mediaPath;
};

const isValidUrl = (url: string, isDataUrl: boolean, urlProtocol?: string, allowedProtocols?: string[]) => {
	if (!urlProtocol) return false;

	// PDFs and other heavy resources are often served as separate files instead of data URLs,
	// so it is very unlikely to encounter a PDF as a data URL.
	if (isDataUrl && !url.toLowerCase().startsWith('data:image/')) {
		logger.warn(`Resources in data URL format are only supported for images ${url}`);
		return false;
	}

	const defaultAllowedProtocols = ['http:', 'https:', 'data:'];
	const allowed = allowedProtocols ?? defaultAllowedProtocols;
	const isAllowedProtocol = allowed.includes(urlProtocol);

	return isAllowedProtocol;
};
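
// Behaviour sketch based on the checks above (illustrative only):
//
//   isValidUrl('https://example.com/a.png', false, 'https:');        // true
//   isValidUrl('data:image/png;base64,...', true, 'data:');          // true
//   isValidUrl('data:application/pdf;base64,...', true, 'data:');    // false (non-image data URL)
//   isValidUrl('ftp://example.com/a.png', false, 'ftp:');            // false (protocol not allowed)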

export async function downloadMediaFile(url: string, fetchOptions?: FetchOptions, allowedProtocols?: string[]) {
	logger.info('Downloading media file', url);

	// The URL we get to download has been extracted from the Markdown document
	url = markdownUtils.unescapeLinkUrl(url);

	const isDataUrl = url && url.toLowerCase().indexOf('data:') === 0;
	const urlProtocol = urlUtils.urlProtocol(url)?.toLowerCase();

	if (!isValidUrl(url, isDataUrl, urlProtocol, allowedProtocols)) {
		return '';
	}

	const fileExt = getFileExtension(url, isDataUrl);
	const mediaPath = generateMediaPath(url, isDataUrl, fileExt);
	let newMediaPath = undefined;

	try {
		if (isDataUrl) {
			await shim.imageFromDataUrl(url, mediaPath);
		} else if (urlProtocol === 'file:') {
			const localPath = fileUriToPath(url);
			await shim.fsDriver().copy(localPath, mediaPath);
		} else {
			const response = await shim.fetchBlob(url, { path: mediaPath, maxRetry: 1, ...fetchOptions });

			if (!fileExt) {
				// If we could not find the file extension from the URL, try to get it
				// now based on the Content-Type header.
				newMediaPath = await tryToGuessExtFromMimeType(response, mediaPath);
			}
		}
		return newMediaPath ?? mediaPath;
	} catch (error) {
		logger.warn(`Cannot download image at ${url}`, error);
		return '';
	}
}
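
// Usage sketch (illustrative; the URL and options are assumptions): downloads the
// remote file into the temp directory and returns the local path, or '' on failure.
//
//   const localPath = await downloadMediaFile('https://example.com/image.png', { timeout: 10000 }, ['http:', 'https:']);
//   if (localPath) { /* attach or convert the file, then clean it up */ }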

async function downloadMediaFiles(urls: string[], fetchOptions?: FetchOptions, allowedProtocols?: string[]) {
	const PromisePool = require('es6-promise-pool');

	const output: any = {};

	const downloadOne = async (url: string) => {
		const mediaPath = await downloadMediaFile(url, fetchOptions, allowedProtocols);
		if (mediaPath) output[url] = { path: mediaPath, originalUrl: url };
	};

	let urlIndex = 0;
	const promiseProducer = () => {
		if (urlIndex >= urls.length) return null;

		const url = urls[urlIndex++];
		return downloadOne(url);
	};

	const concurrency = 10;
	const pool = new PromisePool(promiseProducer, concurrency);
	await pool.start();

	return output;
}
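
// The returned object maps each original URL to its downloaded temp file, e.g.
// (illustrative values):
//
//   {
//     'https://example.com/a.png': { path: '/tmp/a_<uuid>.png', originalUrl: 'https://example.com/a.png' },
//   }
//
// URLs that failed to download or were rejected by isValidUrl() are simply absent.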

async function createResourcesFromPaths(urls: string[]) {
	// Despite the type annotation, "urls" is the map returned by downloadMediaFiles(),
	// keyed by original URL.
	for (const url in urls) {
		if (!urls.hasOwnProperty(url)) continue;
		const urlInfo: any = urls[url];
		try {
			const resource = await shim.createResourceFromPath(urlInfo.path);
			urlInfo.resource = resource;
		} catch (error) {
			logger.warn(`Cannot create resource for ${url}`, error);
		}
	}
	return urls;
}

async function removeTempFiles(urls: string[]) {
	for (const url in urls) {
		if (!urls.hasOwnProperty(url)) continue;
		const urlInfo: any = urls[url];
		try {
			await shim.fsDriver().remove(urlInfo.path);
		} catch (error) {
			logger.warn(`Cannot remove ${urlInfo.path}`, error);
		}
	}
}

function replaceUrlsByResources(markupLanguage: number, md: string, urls: any, imageSizes: any) {
	const imageSizesIndexes: any = {};

	if (markupLanguage === MarkupToHtml.MARKUP_LANGUAGE_HTML) {
		return htmlUtils.replaceMediaUrls(md, (url: string) => {
			const urlInfo: any = urls[url];
			if (!urlInfo || !urlInfo.resource) return url;
			return Resource.internalUrl(urlInfo.resource);
		});
	} else {
		// Proper Regex:
		//
		// /(!\[.*?\]\()([^\s\)]+)(.*?\))/g
		//
		// Broken regex when [embedded_pdf] support was added, and fixed with
		// the `before.startsWith('[![')` hack. But ideally that function should be
		// unit tested to prevent it from being broken again.
		//
		// /(!?\[.*?\]\()([^\s\)]+)(.*?\))/g
		//
		// eslint-disable-next-line no-useless-escape
		return md.replace(/(!?\[.*?\]\()([^\s\)]+)(.*?\))/g, (_match: any, before: string, url: string, after: string) => {
			let type = 'link';
			if (before.startsWith('[embedded_pdf]')) {
				type = 'pdf';
			} else if (before.startsWith('![') || before.startsWith('[![')) {
				type = 'image';
			}

			const urlInfo = urls[url];
			if (type === 'link' || !urlInfo || !urlInfo.resource) return before + url + after;

			const resourceUrl = Resource.internalUrl(urlInfo.resource);
			if (type === 'pdf') {
				return `[${markdownUtils.escapeLinkUrl(url)}](${resourceUrl}${after}`;
			}

			if (!(urlInfo.originalUrl in imageSizesIndexes)) imageSizesIndexes[urlInfo.originalUrl] = 0;
			const imageSizesCollection = imageSizes[urlInfo.originalUrl];
			if (!imageSizesCollection) {
				// Either it's not an image, or we don't know the size of the image.
				// In some cases, we won't find the image size information for that particular image URL. Normally
				// it will only happen when using the "Clip simplified page" feature, which can modify the
				// image URLs (for example it will select a smaller size resolution). In that case, it's
				// fine to return the image as-is because it already has good dimensions.
				return before + resourceUrl + after;
			}

			const imageSize = imageSizesCollection[imageSizesIndexes[urlInfo.originalUrl]];
			imageSizesIndexes[urlInfo.originalUrl]++;

			if (imageSize && (imageSize.naturalWidth !== imageSize.width || imageSize.naturalHeight !== imageSize.height)) {
				return `<img width="${imageSize.width}" height="${imageSize.height}" src="${resourceUrl}"/>`;
			} else {
				return before + resourceUrl + after;
			}
		});
	}
}
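
// Replacement sketch for the Markdown branch (illustrative; the ':/<resource id>'
// form is the internal resource URL as returned by Resource.internalUrl()):
//
//   Input:  ![alt](https://example.com/a.png)
//   Output: ![alt](:/<resource id>)   // or an <img> tag when the clipped size
//                                     // differs from the natural size
//
// Plain links (type 'link') and URLs without a downloaded resource are left untouched.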

export function extractMediaUrls(markupLanguage: number, text: string): string[] {
	const urls: string[] = [];
	urls.push(...ArrayUtils.unique(markupLanguageUtils.extractImageUrls(markupLanguage, text)));
	urls.push(...ArrayUtils.unique(markupLanguageUtils.extractPdfUrls(markupLanguage, text)));
	return urls;
}
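
// Illustrative example, assuming extractImageUrls/extractPdfUrls pick up the image
// and embedded-PDF links in the Markdown below (each list is deduplicated separately):
//
//   extractMediaUrls(MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN,
//     '![a](https://example.com/a.png)\n[embedded_pdf](https://example.com/b.pdf)');
//   // => ['https://example.com/a.png', 'https://example.com/b.pdf']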

// Note must have been saved first
async function attachImageFromDataUrl(note: any, imageDataUrl: string, cropRect: any) {
	const tempDir = Setting.value('tempDir');
	const mime = mimeUtils.fromDataUrl(imageDataUrl);
	let ext = mimeUtils.toFileExtension(mime) || '';
	if (ext) ext = `.${ext}`;
	const tempFilePath = `${tempDir}/${md5(`${Math.random()}_${Date.now()}`)}${ext}`;
	const imageConvOptions: any = {};
	if (cropRect) imageConvOptions.cropRect = cropRect;
	await shim.imageFromDataUrl(imageDataUrl, tempFilePath, imageConvOptions);
	return await shim.attachFileToNote(note, tempFilePath);
}

export const extractNoteFromHTML = async (
	requestNote: RequestNote,
	requestId: number,
	imageSizes: any,
	fetchOptions?: FetchOptions,
	allowedProtocols?: string[],
) => {
	const note = await requestNoteToNote(requestNote);

	const mediaUrls = extractMediaUrls(note.markup_language, note.body);

	logger.info(`Request (${requestId}): Downloading media files: ${mediaUrls.length}`);

	const mediaFiles = await downloadMediaFiles(mediaUrls, fetchOptions, allowedProtocols);

	logger.info(`Request (${requestId}): Creating resources from paths: ${Object.getOwnPropertyNames(mediaFiles).length}`);

	const resources = await createResourcesFromPaths(mediaFiles);
	await removeTempFiles(resources);
	note.body = replaceUrlsByResources(note.markup_language, note.body, resources, imageSizes);

	logger.info(`Request (${requestId}): Saving note...`);

	const saveOptions = defaultSaveOptions('POST', note.id);
	saveOptions.autoTimestamp = false; // No auto-timestamp because the user may have provided their own
	const timestamp = Date.now();
	note.updated_time = timestamp;
	note.created_time = timestamp;
	if (!('user_updated_time' in note)) note.user_updated_time = timestamp;
	if (!('user_created_time' in note)) note.user_created_time = timestamp;

	return { note, saveOptions, resources };
};
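
// Sketch of the POST pipeline handled below (illustrative; the payload is an
// assumption). The returned note is not yet saved, so callers pass it to Note.save().
//
//   const { note, saveOptions } = await extractNoteFromHTML(
//     { title: 'Clip', body_html: '<p>Hi</p>', convert_to: '', stylesheets: [] },
//     Date.now(), {},
//   );
//   const saved = await Note.save(note, saveOptions);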

export default async function(request: Request, id: string = null, link: string = null) {
	if (request.method === 'GET') {
		if (link && link === 'tags') {
			return collectionToPaginatedResults(ModelType.Tag, await Tag.tagsByNoteId(id), request);
		} else if (link && link === 'resources') {
			const note = await Note.load(id);
			if (!note) throw new ErrorNotFound();
			const resourceIds = await Note.linkedResourceIds(note.body);
			const output = [];
			const loadOptions = defaultLoadOptions(request, BaseModel.TYPE_RESOURCE);
			for (const resourceId of resourceIds) {
				output.push(await Resource.load(resourceId, loadOptions));
			}
			return collectionToPaginatedResults(ModelType.Resource, output, request);
		} else if (link) {
			throw new ErrorNotFound();
		}

		return defaultAction(BaseModel.TYPE_NOTE, request, id, link);
	}

	if (request.method === RequestMethod.POST) {
		const requestId = Date.now();
		const requestNote = JSON.parse(request.body);

		// const allowFileProtocolImages = urlUtils.urlProtocol(requestNote.base_url).toLowerCase() === 'file:';

		const imageSizes = requestNote.image_sizes ? requestNote.image_sizes : {};

		logger.info('Images:', imageSizes);

		const allowedProtocolsForDownloadMediaFiles = ['http:', 'https:', 'file:', 'data:'];
		const extracted = await extractNoteFromHTML(requestNote, requestId, imageSizes, undefined, allowedProtocolsForDownloadMediaFiles);

		let note = await Note.save(extracted.note, extracted.saveOptions);

		if (requestNote.tags) {
			const tagTitles = requestNote.tags.split(',');
			await Tag.setNoteTagsByTitles(note.id, tagTitles);
		}

		if (requestNote.image_data_url) {
			note = await attachImageFromDataUrl(note, requestNote.image_data_url, requestNote.crop_rect);
		}

		logger.info(`Request (${requestId}): Created note ${note.id}`);

		return note;
	}

	if (request.method === 'PUT') {
		const note = await Note.load(id);

		if (!note) throw new ErrorNotFound();

		const saveOptions = {
			...defaultSaveOptions('PUT', note.id),
			autoTimestamp: false, // No auto-timestamp because the user may have provided their own
			userSideValidation: true,
		};

		const timestamp = Date.now();

		const newProps = request.bodyJson(readonlyProperties('PUT'));
		if (!('user_updated_time' in newProps)) newProps.user_updated_time = timestamp;

		let newNote = {
			...note,
			...newProps,
			updated_time: timestamp,
		};

		newNote = await Note.save(newNote, saveOptions);

		const requestNote = JSON.parse(request.body);
		if (requestNote.tags || requestNote.tags === '') {
			const tagTitles = requestNote.tags.split(',');
			await Tag.setNoteTagsByTitles(id, tagTitles);
		}

		return newNote;
	}

	return defaultAction(BaseModel.TYPE_NOTE, request, id, link);
}
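
// Callers typically reach this handler through the Web Clipper / data API, e.g.
// (a sketch assuming the default Clipper port and a placeholder token):
//
//   curl --data '{ "title": "My note", "body": "Some text" }' \
//     'http://localhost:41184/notes?token=YOUR_TOKEN'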