Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384)

2024-12-24 10:27:10 +02:00 · 2022-06-20 18:26:54 +05:30 · 2022-06-20 18:26:54 +05:30 · c0bc4c38c3
commit c0bc4c38c3
parent 0c50a5ab9b
7 changed files with 217 additions and 35 deletions
--- a/packages/app-clipper/content_scripts/index.js
+++ b/packages/app-clipper/content_scripts/index.js
@ -32,6 +32,15 @@
 		}
 	}
 	function escapeHtml(s) {
 		// Ordered list of [character, entity] pairs. The ampersand has to be
 		// handled first, otherwise the "&" introduced by the other entities
 		// would be escaped a second time.
 		const entities = [
 			['&', '&amp;'],
 			['<', '&lt;'],
 			['>', '&gt;'],
 			['"', '&quot;'],
 			['\'', '&#039;'],
 		];
 		let escaped = s;
 		for (const [character, entity] of entities) {
 			escaped = escaped.split(character).join(entity);
 		}
 		return escaped;
 	}
 	function pageTitle() {
 		const titleElements = document.getElementsByTagName('title');
 		if (titleElements.length) return titleElements[0].text.trim();
@ -204,6 +213,16 @@
 					}
 				}
 				if (nodeName === 'embed') {
 					const src = absoluteUrl(node.src);
 					node.setAttribute('src', src);
 				}
 				if (nodeName === 'object') {
 					const data = absoluteUrl(node.data);
 					node.setAttribute('data', data);
 				}
 				cleanUpElement(convertToMarkup, node, imageSizes, imageIndexes);
 			}
 		}
@ -317,6 +336,9 @@
 	}
 	function readabilityProcess() {
 		if (isPagePdf()) throw new Error('Could not parse PDF document with Readability');
 		// eslint-disable-next-line no-undef
 		const readability = new Readability(documentForReadability());
 		const article = readability.parse();
@ -329,6 +351,14 @@
 		};
 	}
 	function isPagePdf() {
 		return document.contentType == 'application/pdf';
 	}
 	function embedPageUrl() {
 		// Builds an <embed> tag pointing at the current page. Both attribute
 		// values are HTML-escaped before being inserted into the markup.
 		const src = escapeHtml(window.location.href);
 		const type = escapeHtml(document.contentType);
 		return `<embed src="${src}" type="${type}" />`;
 	}
 	async function prepareCommandResponse(command) {
 		console.info(`Got command: ${command.name}`);
 		const shouldSendToJoplin = !!command.shouldSendToJoplin;
@ -375,6 +405,10 @@
 		} else if (command.name === 'completePageHtml') {
 			if (isPagePdf()) {
 				return clippedContentResponse(pageTitle(), embedPageUrl(), getImageSizes(document), getAnchorNames(document));
 			}
 			hardcodePreStyles(document);
 			addSvgClass(document);
 			preProcessDocument(document);
--- a/packages/lib/HtmlToMd.ts
+++ b/packages/lib/HtmlToMd.ts
@ -2,17 +2,20 @@ const TurndownService = require('@joplin/turndown');
 const turndownPluginGfm = require('@joplin/turndown-plugin-gfm').gfm;
 import markdownUtils from './markdownUtils';
 const pdfUrlRegex = /[\s\S]*?\.pdf$/i;
 export interface ParseOptions {
 	// Anchor names forwarded to Turndown; each one is trimmed and lower-cased first.
 	anchorNames?: string[];
 	preserveImageTagsWithSize?: boolean;
 	// When set, the generated Markdown is passed through markdownUtils.prependBaseUrl with this value.
 	baseUrl?: string;
 	// Forwarded to Turndown as-is; treated as false when the key is absent.
 	disableEscapeContent?: boolean;
 	// When true, <embed>/<object> elements whose src/data ends with ".pdf" are
 	// emitted as `[embedded_pdf](url)` links instead of being dropped.
 	convertEmbeddedPdfsToLinks?: boolean;
 }
 export default class HtmlToMd {
 	public parse(html: string, options: ParseOptions = {}) {
-		const turndown = new TurndownService({
+		const turndownOpts: any = {
 			headingStyle: 'atx',
 			anchorNames: options.anchorNames ? options.anchorNames.map(n => n.trim().toLowerCase()) : [],
 			codeBlockStyle: 'fenced',
@ -22,10 +25,36 @@ export default class HtmlToMd {
 			strongDelimiter: '**',
 			br: '',
 			disableEscapeContent: 'disableEscapeContent' in options ? options.disableEscapeContent : false,
-		});
+		};
 		if (options.convertEmbeddedPdfsToLinks) {
 			// Turndown ignores empty <object> tags, so we need to handle this case separately
 			// https://github.com/mixmark-io/turndown/issues/293#issuecomment-588984202
 			turndownOpts.blankReplacement = (content: string, node: any) => {
 				if (node.matches('object')) {
 					return pdfRule.replacement(content, node, {});
 				}
 				return '\n\n';
 			};
 		}
 		const turndown = new TurndownService(turndownOpts);
 		turndown.use(turndownPluginGfm);
 		turndown.remove('script');
 		turndown.remove('style');
 		const pdfRule = {
 			filter: ['embed', 'object'],
 			replacement: function(_content: string, node: any, _options: any) {
 				// We are setting embedded_pdf as the link name so that we can later distinguish them from normal links and create resources for them.
 				if (node.matches('embed') && node.getAttribute('src') && pdfUrlRegex.test(node.getAttribute('src'))) {
 					return `[embedded_pdf](${node.getAttribute('src')})`;
 				} else if (node.matches('object') && node.getAttribute('data') && pdfUrlRegex.test(node.getAttribute('data'))) {
 					return `[embedded_pdf](${node.getAttribute('data')})`;
 				}
 				return '';
 			},
 		};
 		if (options.convertEmbeddedPdfsToLinks) {
 			turndown.addRule('pdf', pdfRule);
 		}
 		let md = turndown.turndown(html);
 		if (options.baseUrl) md = markdownUtils.prependBaseUrl(md, options.baseUrl);
 		return md;
--- a/packages/lib/htmlUtils.ts
+++ b/packages/lib/htmlUtils.ts
@ -7,6 +7,9 @@ const { escapeHtml } = require('./string-utils.js');
 // https://stackoverflow.com/a/16119722/561309
 const imageRegex = /<img([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
 const anchorRegex = /<a([\s\S]*?)href=["']([\s\S]*?)["']([\s\S]*?)>/gi;
 const embedRegex = /<embed([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
 const objectRegex = /<object([\s\S]*?)data=["']([\s\S]*?)["']([\s\S]*?)>/gi;
 const pdfUrlRegex = /[\s\S]*?\.pdf$/i;
 const selfClosingElements = [
 	'area',
@ -61,6 +64,11 @@ class HtmlUtils {
 		return this.extractUrls(imageRegex, html);
 	}
 	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
 	public extractPdfUrls(html: string) {
 		// Collect the URLs of every <embed src> and <object data>, embeds first
 		const embedUrls = this.extractUrls(embedRegex, html);
 		const objectUrls = this.extractUrls(objectRegex, html);
 		const candidates = [...embedUrls, ...objectUrls];
 		// Keep only those that look like PDF files
 		return candidates.filter(candidate => pdfUrlRegex.test(candidate));
 	}
 	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
 	public extractAnchorUrls(html: string) {
 		return this.extractUrls(anchorRegex, html);
@ -87,6 +95,27 @@ class HtmlUtils {
 		});
 	}
 	public replaceEmbedUrls(html: string, callback: Function) {
 		if (!html) return '';
 		// <embed> and <object> tags are disabled in Joplin for security reasons
 		// (see CVE-2020-15930), so every match is rewritten as a plain <a> link.
 		// The callback maps the original URL to the link target, and the
 		// original URL (HTML-escaped) is used as the link text.
 		// NOTE(review): objectRegex only matches the opening <object ...> tag, so
 		// a closing </object> presumably stays in the output - confirm this is fine.
 		const toAnchor = (_match: string, _before: string, src: string, _after: string) => {
 			const target = callback(src);
 			return `<a href="${target}">${escapeHtml(src)}</a>`;
 		};
 		return html.replace(embedRegex, toAnchor).replace(objectRegex, toAnchor);
 	}
 	public replaceMediaUrls(html: string, callback: Function) {
 		// Images are processed first, then <embed>/<object> elements, both
 		// with the same URL-mapping callback.
 		const withImagesReplaced = this.replaceImageUrls(html, callback);
 		return this.replaceEmbedUrls(withImagesReplaced, callback);
 	}
 	// Note that the URLs provided by this function are URL-encoded, which is
 	// usually what you want for web URLs. But if they are file:// URLs and the
 	// file path is going to be used, it will need to be unescaped first. The
--- a/packages/lib/markdownUtils.ts
+++ b/packages/lib/markdownUtils.ts
@ -69,7 +69,7 @@ const markdownUtils = {
 	},
 	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
-	extractFileUrls(md: string, onlyImage: boolean = false): Array<string> {
+	extractFileUrls(md: string, onlyType: string = null): Array<string> {
 		const markdownIt = new MarkdownIt();
 		markdownIt.validateLink = validateLinks; // Necessary to support file:/// links
@ -77,10 +77,16 @@ const markdownUtils = {
 		const tokens = markdownIt.parse(md, env);
 		const output: string[] = [];
 		let linkType = onlyType;
 		if (linkType === 'pdf') linkType = 'link_open';
 		const searchUrls = (tokens: any[]) => {
 			for (let i = 0; i < tokens.length; i++) {
 				const token = tokens[i];
-				if ((onlyImage === true && token.type === 'image') || (onlyImage === false && (token.type === 'image' || token.type === 'link_open'))) {
+				if ((!onlyType && (token.type === 'link_open' || token.type === 'image')) || (!!onlyType && token.type === onlyType) || (onlyType == 'pdf' && token.type === 'link_open')) {
 					// Pdf embeds are a special case, they are represented as 'link_open' tokens but are marked with 'embedded_pdf' as link name by the parser
 					// We only add the link to the list if it is in the proper PDF link format
 					if (onlyType === 'pdf' && !(tokens.length > i + 1 && tokens[i + 1].type === 'text' && tokens[i + 1].content === 'embedded_pdf')) continue;
 					for (let j = 0; j < token.attrs.length; j++) {
 						const a = token.attrs[j];
 						if ((a[0] === 'src' || a[0] === 'href') && a.length >= 2 && a[1]) {
@ -107,7 +113,11 @@ const markdownUtils = {
 	},
 	extractImageUrls(md: string) {
 		// Thin wrapper around extractFileUrls that asks for image URLs only.
-		return markdownUtils.extractFileUrls(md,true);
+		return markdownUtils.extractFileUrls(md, 'image');
 	},
 	extractPdfUrls(md: string) {
 		// With 'pdf', extractFileUrls only keeps links whose text is `embedded_pdf`
 		const pdfUrls = markdownUtils.extractFileUrls(md, 'pdf');
 		return pdfUrls;
 	},
 	// The match results has 5 items
--- a/packages/lib/markupLanguageUtils.ts
+++ b/packages/lib/markupLanguageUtils.ts
@ -28,6 +28,17 @@ export class MarkupLanguageUtils {
 		return urls;
 	}
 	public extractPdfUrls(language: MarkupLanguage, text: string): string[] {
 		// A specific language is delegated straight to its own library
 		if (language !== MarkupLanguage.Any) {
 			return this.lib_(language).extractPdfUrls(text);
 		}
 		// For "Any", run both extractors and return the Markdown results
 		// followed by the HTML results.
 		const fromMarkdown = this.lib_(MarkupLanguage.Markdown).extractPdfUrls(text);
 		const fromHtml = this.lib_(MarkupLanguage.Html).extractPdfUrls(text);
 		const combined: string[] = [];
 		return combined.concat(fromMarkdown).concat(fromHtml);
 	}
 	// Create a new MarkupToHtml instance while injecting options specific to Joplin
 	// desktop and mobile applications.
 	public newMarkupToHtml(_plugins: PluginStates = null, options: Options = null) {
--- a/packages/lib/services/rest/Api.test.ts
+++ b/packages/lib/services/rest/Api.test.ts
@ -1,5 +1,6 @@
 import { PaginationOrderDir } from '../../models/utils/types';
 import Api, { RequestMethod } from '../../services/rest/Api';
 import { extractMediaUrls } from './routes/notes';
 import shim from '../../shim';
 import { setupDatabaseAndSynchronizer, switchClient, checkThrowAsync, db, msleep, supportDir } from '../../testing/test-utils';
 import Folder from '../../models/Folder';
@ -9,6 +10,7 @@ import Tag from '../../models/Tag';
 import NoteTag from '../../models/NoteTag';
 import ResourceService from '../../services/ResourceService';
 import SearchEngine from '../../services/searchengine/SearchEngine';
 const { MarkupToHtml } = require('@joplin/renderer');
 import { ResourceEntity } from '../database/types';
 const createFolderForPagination = async (num: number, time: number) => {
@ -452,6 +454,47 @@ describe('services_rest_Api', function() {
 		expect(response.body).toBe('**Bold text**');
 	}));
 	it('should extract media urls from body', (() => {
 		// Each case lists the markup language, the note body and the URLs that
 		// are expected back (images first, then PDFs).
 		const testCases = [
 			{
 				language: MarkupToHtml.MARKUP_LANGUAGE_HTML,
 				body: '<div> <img src="https://example.com/img.png" /> <embed src="https://example.com/sample.pdf"/> <object data="https://example.com/file.PDF"></object> </div>',
 				result: ['https://example.com/img.png', 'https://example.com/sample.pdf', 'https://example.com/file.PDF'],
 			},
 			{
 				language: MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN,
 				body: 'test text \n ![img 1](https://example.com/img1.png) [embedded_pdf](https://example.com/sample1.pdf) [embedded_pdf](https://example.com/file.PDF)',
 				result: ['https://example.com/img1.png', 'https://example.com/sample1.pdf', 'https://example.com/file.PDF'],
 			},
 			{
 				// Nothing here qualifies: no .pdf extension, no src at all, a
 				// near-miss extension, and a regular anchor.
 				language: MarkupToHtml.MARKUP_LANGUAGE_HTML,
 				body: '<div> <embed src="https://example.com/sample"/> <embed /> <object data="https://example.com/file.pdfff"></object> <a href="https://test.com/file.pdf">Link</a> </div>',
 				result: [],
 			},
 		];
 		for (const { language, body, result } of testCases) {
 			expect(extractMediaUrls(language, body)).toEqual(result);
 		}
 	}));
 	// Posts a note whose HTML body embeds a local PDF and checks that the PDF
 	// ends up as a resource referenced from the saved note.
 	it('should create notes with pdf embeds', (async () => {
 		let response = null;
 		const f = await Folder.save({ title: 'pdf test1' });
 		// The embed points at a file:// URL inside the test support directory
 		response = await api.route(RequestMethod.POST, 'notes', null, JSON.stringify({
 			title: 'testing PDF embeds',
 			parent_id: f.id,
 			body_html: `<div> <embed src="file://${supportDir}/welcome.pdf" type="application/pdf" /> </div>`,
 		}));
 		// Exactly one resource is expected to exist after the request
 		const resources = await Resource.all();
 		expect(resources.length).toBe(1);
 		// The returned note body must mention the ID of that resource
 		const resource = resources[0];
 		expect(response.body.indexOf(resource.id) >= 0).toBe(true);
 	}));
 	it('should handle tokens', (async () => {
 		api = new Api('mytoken');
--- a/packages/lib/services/rest/routes/notes.ts
+++ b/packages/lib/services/rest/routes/notes.ts
@ -89,6 +89,7 @@ async function requestNoteToNote(requestNote: any) {
 			output.body = await htmlToMdParser().parse(`<div>${requestNote.body_html}</div>`, {
 				baseUrl: baseUrl,
 				anchorNames: requestNote.anchor_names ? requestNote.anchor_names : [],
 				convertEmbeddedPdfsToLinks: true,
 			});
 			output.markup_language = MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN;
 		}
@ -143,19 +144,20 @@ async function buildNoteStyleSheet(stylesheets: any[]) {
 	return output;
 }
-async function tryToGuessImageExtFromMimeType(response: any, imagePath: string) {
+async function tryToGuessExtFromMimeType(response: any, mediaPath: string) {
 	// Derives a file extension from the MIME type found in the response
 	// headers. When one is found, the file is moved to "<path>.<ext>" and the
 	// new path is returned; otherwise the path is returned unchanged.
 	const mimeType = mimeTypeFromHeaders(response.headers);
-	if (!mimeType) return imagePath;
+	if (!mimeType) return mediaPath;
 	const newExt = mimeUtils.toFileExtension(mimeType);
-	if (!newExt) return imagePath;
+	if (!newExt) return mediaPath;
-	const newImagePath = `${imagePath}.${newExt}`;
+	const newMediaPath = `${mediaPath}.${newExt}`;
-	await shim.fsDriver().move(imagePath, newImagePath);
+	await shim.fsDriver().move(mediaPath, newMediaPath);
-	return newImagePath;
+	return newMediaPath;
 }
-async function downloadImage(url: string /* , allowFileProtocolImages */) {
+async function downloadMediaFile(url: string /* , allowFileProtocolImages */) {
 	const tempDir = Setting.value('tempDir');
 	// The URL we get to download have been extracted from the Markdown document
@ -163,6 +165,12 @@ async function downloadImage(url: string /* , allowFileProtocolImages */) {
 	const isDataUrl = url && url.toLowerCase().indexOf('data:') === 0;
 	// PDFs and other heavy resources are often served as separate files instead of data URLs; it's very unlikely to encounter a PDF as a data URL
 	if (isDataUrl && !url.toLowerCase().startsWith('data:image/')) {
 		reg.logger().warn(`Resources in data URL format is only supported for images ${url}`);
 		return '';
 	}
 	const name = isDataUrl ? md5(`${Math.random()}_${Date.now()}`) : filename(url);
 	let fileExt = isDataUrl ? mimeUtils.toFileExtension(mimeUtils.fromDataUrl(url)) : safeFileExtension(fileExtension(url).toLowerCase());
 	if (!mimeUtils.fromFileExtension(fileExt)) fileExt = ''; // If the file extension is unknown - clear it.
@ -170,38 +178,38 @@ async function downloadImage(url: string /* , allowFileProtocolImages */) {
 	// Append a UUID because simply checking if the file exists is not enough since
 	// multiple resources can be downloaded at the same time (race condition).
-	let imagePath = `${tempDir}/${safeFilename(name)}_${uuid.create()}${fileExt}`;
+	let mediaPath = `${tempDir}/${safeFilename(name)}_${uuid.create()}${fileExt}`;
 	try {
 		if (isDataUrl) {
-			await shim.imageFromDataUrl(url, imagePath);
+			await shim.imageFromDataUrl(url, mediaPath);
 		} else if (urlUtils.urlProtocol(url).toLowerCase() === 'file:') {
 			// Can't think of any reason to disallow this at this point
 			// if (!allowFileProtocolImages) throw new Error('For security reasons, this URL with file:// protocol cannot be downloaded');
 			const localPath = fileUriToPath(url);
-			await shim.fsDriver().copy(localPath, imagePath);
+			await shim.fsDriver().copy(localPath, mediaPath);
 		} else {
-			const response = await shim.fetchBlob(url, { path: imagePath, maxRetry: 1 });
+			const response = await shim.fetchBlob(url, { path: mediaPath, maxRetry: 1 });
 			// If we could not find the file extension from the URL, try to get it
 			// now based on the Content-Type header.
-			if (!fileExt) imagePath = await tryToGuessImageExtFromMimeType(response, imagePath);
+			if (!fileExt) mediaPath = await tryToGuessExtFromMimeType(response, mediaPath);
 		}
-		return imagePath;
+		return mediaPath;
 	} catch (error) {
 		reg.logger().warn(`Cannot download image at ${url}`, error);
 		return '';
 	}
 }
-async function downloadImages(urls: string[] /* , allowFileProtocolImages:boolean */) {
+async function downloadMediaFiles(urls: string[] /* , allowFileProtocolImages:boolean */) {
 	const PromisePool = require('es6-promise-pool');
 	const output: any = {};
 	const downloadOne = async (url: string) => {
-		const imagePath = await downloadImage(url); // , allowFileProtocolImages);
+		const mediaPath = await downloadMediaFile(url); // , allowFileProtocolImages);
-		if (imagePath) output[url] = { path: imagePath, originalUrl: url };
+		if (mediaPath) output[url] = { path: mediaPath, originalUrl: url };
 	};
 	let urlIndex = 0;
@ -245,27 +253,38 @@ async function removeTempFiles(urls: string[]) {
 	}
 }
-function replaceImageUrlsByResources(markupLanguage: number, md: string, urls: any, imageSizes: any) {
+function replaceUrlsByResources(markupLanguage: number, md: string, urls: any, imageSizes: any) {
 	const imageSizesIndexes: any = {};
 	if (markupLanguage === MarkupToHtml.MARKUP_LANGUAGE_HTML) {
-		return htmlUtils.replaceImageUrls(md, (imageUrl: string) => {
+		return htmlUtils.replaceMediaUrls(md, (url: string) => {
-			const urlInfo: any = urls[imageUrl];
+			const urlInfo: any = urls[url];
-			if (!urlInfo || !urlInfo.resource) return imageUrl;
+			if (!urlInfo || !urlInfo.resource) return url;
 			return Resource.internalUrl(urlInfo.resource);
 		});
 	} else {
 		// eslint-disable-next-line no-useless-escape
-		return md.replace(/(!\[.*?\]\()([^\s\)]+)(.*?\))/g, (_match: any, before: string, imageUrl: string, after: string) => {
+		return md.replace(/(!?\[.*?\]\()([^\s\)]+)(.*?\))/g, (_match: any, before: string, url: string, after: string) => {
-			const urlInfo = urls[imageUrl];
+			let type = 'link';
-			if (!urlInfo || !urlInfo.resource) return before + imageUrl + after;
+			if (before.startsWith('[embedded_pdf]')) {
-			if (!(urlInfo.originalUrl in imageSizesIndexes)) imageSizesIndexes[urlInfo.originalUrl] = 0;
+				type = 'pdf';
 			} else if (before.startsWith('![')) {
 				type = 'image';
 			}
 			const urlInfo = urls[url];
 			if (type === 'link' || !urlInfo || !urlInfo.resource) return before + url + after;
 			const resourceUrl = Resource.internalUrl(urlInfo.resource);
-			const imageSizesCollection = imageSizes[urlInfo.originalUrl];
+			if (type === 'pdf') {
 				return `[${markdownUtils.escapeLinkUrl(url)}](${resourceUrl}${after}`;
 			}
 			if (!(urlInfo.originalUrl in imageSizesIndexes)) imageSizesIndexes[urlInfo.originalUrl] = 0;
 			const imageSizesCollection = imageSizes[urlInfo.originalUrl];
 			if (!imageSizesCollection) {
-				// In some cases, we won't find the image size information for that particular URL. Normally
+				// Either its not an image or we don't know the size of the image
 				// In some cases, we won't find the image size information for that particular image URL. Normally
 				// it will only happen when using the "Clip simplified page" feature, which can modify the
 				// image URLs (for example it will select a smaller size resolution). In that case, it's
 				// fine to return the image as-is because it has already good dimensions.
@ -284,6 +303,13 @@ function replaceImageUrlsByResources(markupLanguage: number, md: string, urls: a
 	}
 }
 export function extractMediaUrls(markupLanguage: number, text: string): string[] {
 	// Image URLs come first, followed by embedded PDF URLs. Each group is
 	// de-duplicated on its own before the two are joined.
 	const imageUrls = ArrayUtils.unique(markupLanguageUtils.extractImageUrls(markupLanguage, text));
 	const pdfUrls = ArrayUtils.unique(markupLanguageUtils.extractPdfUrls(markupLanguage, text));
 	return [...imageUrls, ...pdfUrls];
 }
 // Note must have been saved first
 async function attachImageFromDataUrl(note: any, imageDataUrl: string, cropRect: any) {
 	const tempDir = Setting.value('tempDir');
@ -328,17 +354,17 @@ export default async function(request: Request, id: string = null, link: string
 		let note: any = await requestNoteToNote(requestNote);
-		const imageUrls = ArrayUtils.unique(markupLanguageUtils.extractImageUrls(note.markup_language, note.body));
+		const mediaUrls = extractMediaUrls(note.markup_language, note.body);
-		reg.logger().info(`Request (${requestId}): Downloading images: ${imageUrls.length}`);
+		reg.logger().info(`Request (${requestId}): Downloading media files: ${mediaUrls.length}`);
-		let result = await downloadImages(imageUrls); // , allowFileProtocolImages);
+		let result = await downloadMediaFiles(mediaUrls); // , allowFileProtocolImages);
 		reg.logger().info(`Request (${requestId}): Creating resources from paths: ${Object.getOwnPropertyNames(result).length}`);
 		result = await createResourcesFromPaths(result);
 		await removeTempFiles(result);
-		note.body = replaceImageUrlsByResources(note.markup_language, note.body, result, imageSizes);
+		note.body = replaceUrlsByResources(note.markup_language, note.body, result, imageSizes);
 		reg.logger().info(`Request (${requestId}): Saving note...`);