mirror of https://github.com/laurent22/joplin.git synced 2024-12-24 10:27:10 +02:00

Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384)

asrient 2022-06-20 18:26:54 +05:30 committed by GitHub
parent 0c50a5ab9b
commit c0bc4c38c3
7 changed files with 217 additions and 35 deletions

View File

@@ -32,6 +32,15 @@
}
}
function escapeHtml(s) {
return s
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#039;');
}
function pageTitle() {
const titleElements = document.getElementsByTagName('title');
if (titleElements.length) return titleElements[0].text.trim();
@@ -204,6 +213,16 @@
}
}
if (nodeName === 'embed') {
const src = absoluteUrl(node.src);
node.setAttribute('src', src);
}
if (nodeName === 'object') {
const data = absoluteUrl(node.data);
node.setAttribute('data', data);
}
cleanUpElement(convertToMarkup, node, imageSizes, imageIndexes);
}
}
@@ -317,6 +336,9 @@
}
function readabilityProcess() {
if (isPagePdf()) throw new Error('Could not parse PDF document with Readability');
// eslint-disable-next-line no-undef
const readability = new Readability(documentForReadability());
const article = readability.parse();
@@ -329,6 +351,14 @@
};
}
function isPagePdf() {
return document.contentType == 'application/pdf';
}
function embedPageUrl() {
return `<embed src="${escapeHtml(window.location.href)}" type="${escapeHtml(document.contentType)}" />`;
}
async function prepareCommandResponse(command) {
console.info(`Got command: ${command.name}`);
const shouldSendToJoplin = !!command.shouldSendToJoplin;
@@ -375,6 +405,10 @@
} else if (command.name === 'completePageHtml') {
if (isPagePdf()) {
return clippedContentResponse(pageTitle(), embedPageUrl(), getImageSizes(document), getAnchorNames(document));
}
hardcodePreStyles(document);
addSvgClass(document);
preProcessDocument(document);
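
For orientation, here is a minimal sketch (not part of the commit) of what the content script now clips for a PDF tab; the URL is an illustrative assumption:

// Sketch: on a tab where document.contentType is 'application/pdf',
// readabilityProcess() refuses to run and completePageHtml clips a single <embed> instead.
// With window.location.href === 'https://example.com/paper.pdf':
//   isPagePdf()     // true
//   embedPageUrl()  // '<embed src="https://example.com/paper.pdf" type="application/pdf" />'
// This markup is what HtmlToMd later converts into an [embedded_pdf](...) link.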

View File

@@ -2,17 +2,20 @@ const TurndownService = require('@joplin/turndown');
const turndownPluginGfm = require('@joplin/turndown-plugin-gfm').gfm;
import markdownUtils from './markdownUtils';
const pdfUrlRegex = /[\s\S]*?\.pdf$/i;
export interface ParseOptions {
anchorNames?: string[];
preserveImageTagsWithSize?: boolean;
baseUrl?: string;
disableEscapeContent?: boolean;
convertEmbeddedPdfsToLinks?: boolean;
}
export default class HtmlToMd {
public parse(html: string, options: ParseOptions = {}) {
const turndown = new TurndownService({
const turndownOpts: any = {
headingStyle: 'atx',
anchorNames: options.anchorNames ? options.anchorNames.map(n => n.trim().toLowerCase()) : [],
codeBlockStyle: 'fenced',
@@ -22,10 +25,36 @@ export default class HtmlToMd {
strongDelimiter: '**',
br: '',
disableEscapeContent: 'disableEscapeContent' in options ? options.disableEscapeContent : false,
});
};
if (options.convertEmbeddedPdfsToLinks) {
// Turndown ignores empty <object> tags, so we need to handle this case separately
// https://github.com/mixmark-io/turndown/issues/293#issuecomment-588984202
turndownOpts.blankReplacement = (content: string, node: any) => {
if (node.matches('object')) {
return pdfRule.replacement(content, node, {});
}
return '\n\n';
};
}
const turndown = new TurndownService(turndownOpts);
turndown.use(turndownPluginGfm);
turndown.remove('script');
turndown.remove('style');
const pdfRule = {
filter: ['embed', 'object'],
replacement: function(_content: string, node: any, _options: any) {
// We set 'embedded_pdf' as the link name so that we can later distinguish these links from normal ones and create resources for them.
if (node.matches('embed') && node.getAttribute('src') && pdfUrlRegex.test(node.getAttribute('src'))) {
return `[embedded_pdf](${node.getAttribute('src')})`;
} else if (node.matches('object') && node.getAttribute('data') && pdfUrlRegex.test(node.getAttribute('data'))) {
return `[embedded_pdf](${node.getAttribute('data')})`;
}
return '';
},
};
if (options.convertEmbeddedPdfsToLinks) {
turndown.addRule('pdf', pdfRule);
}
let md = turndown.turndown(html);
if (options.baseUrl) md = markdownUtils.prependBaseUrl(md, options.baseUrl);
return md;
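
A usage sketch of the new option (not part of the diff; the input HTML and variable names are examples):

const htmlToMd = new HtmlToMd();
const md = htmlToMd.parse('<div><embed src="https://example.com/sample.pdf" type="application/pdf" /></div>', { convertEmbeddedPdfsToLinks: true });
// With the pdf rule above, md becomes: [embedded_pdf](https://example.com/sample.pdf)
// Without the option, the rule is not registered and the tags fall through to Turndown's default handling.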

View File

@@ -7,6 +7,9 @@ const { escapeHtml } = require('./string-utils.js');
// https://stackoverflow.com/a/16119722/561309
const imageRegex = /<img([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const anchorRegex = /<a([\s\S]*?)href=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const embedRegex = /<embed([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const objectRegex = /<object([\s\S]*?)data=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const pdfUrlRegex = /[\s\S]*?\.pdf$/i;
const selfClosingElements = [
'area',
@@ -61,6 +64,11 @@ class HtmlUtils {
return this.extractUrls(imageRegex, html);
}
// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
public extractPdfUrls(html: string) {
return [...this.extractUrls(embedRegex, html), ...this.extractUrls(objectRegex, html)].filter(url => pdfUrlRegex.test(url));
}
// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
public extractAnchorUrls(html: string) {
return this.extractUrls(anchorRegex, html);
@@ -87,6 +95,27 @@ class HtmlUtils {
});
}
public replaceEmbedUrls(html: string, callback: Function) {
if (!html) return '';
// We add the link as an <a> tag since Joplin disables <embed> and <object> tags for security reasons.
// See: CVE-2020-15930
html = html.replace(embedRegex, (_v: string, _before: string, src: string, _after: string) => {
const link = callback(src);
return `<a href="${link}">${escapeHtml(src)}</a>`;
});
html = html.replace(objectRegex, (_v: string, _before: string, src: string, _after: string) => {
const link = callback(src);
return `<a href="${link}">${escapeHtml(src)}</a>`;
});
return html;
}
public replaceMediaUrls(html: string, callback: Function) {
html = this.replaceImageUrls(html, callback);
html = this.replaceEmbedUrls(html, callback);
return html;
}
// Note that the URLs provided by this function are URL-encoded, which is
// usually what you want for web URLs. But if they are file:// URLs and the
// file path is going to be used, it will need to be unescaped first. The
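
A quick sketch of the new helpers (illustrative only; assumes the HtmlUtils instance exported by this module and a made-up resource link):

const html = '<embed src="https://example.com/a.pdf" /> <object data="https://example.com/b.PDF"></object>';
htmlUtils.extractPdfUrls(html); // ['https://example.com/a.pdf', 'https://example.com/b.PDF'] - the .pdf check is case-insensitive
htmlUtils.replaceEmbedUrls(html, () => ':/0123456789abcdef0123456789abcdef'); // each tag becomes <a href=":/...">original source URL</a>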

View File

@@ -69,7 +69,7 @@ const markdownUtils = {
},
// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
extractFileUrls(md: string, onlyImage: boolean = false): Array<string> {
extractFileUrls(md: string, onlyType: string = null): Array<string> {
const markdownIt = new MarkdownIt();
markdownIt.validateLink = validateLinks; // Necessary to support file:/// links
@@ -77,10 +77,16 @@ const markdownUtils = {
const tokens = markdownIt.parse(md, env);
const output: string[] = [];
let linkType = onlyType;
if (linkType === 'pdf') linkType = 'link_open';
const searchUrls = (tokens: any[]) => {
for (let i = 0; i < tokens.length; i++) {
const token = tokens[i];
if ((onlyImage === true && token.type === 'image') || (onlyImage === false && (token.type === 'image' || token.type === 'link_open'))) {
if ((!onlyType && (token.type === 'link_open' || token.type === 'image')) || (!!onlyType && token.type === onlyType) || (onlyType == 'pdf' && token.type === 'link_open')) {
// PDF embeds are a special case: they are represented as 'link_open' tokens but are marked with 'embedded_pdf' as the link name by the parser.
// We make sure the token is in the proper PDF link format before adding it to the list.
if (onlyType === 'pdf' && !(tokens.length > i + 1 && tokens[i + 1].type === 'text' && tokens[i + 1].content === 'embedded_pdf')) continue;
for (let j = 0; j < token.attrs.length; j++) {
const a = token.attrs[j];
if ((a[0] === 'src' || a[0] === 'href') && a.length >= 2 && a[1]) {
@@ -107,7 +113,11 @@ const markdownUtils = {
},
extractImageUrls(md: string) {
return markdownUtils.extractFileUrls(md,true);
return markdownUtils.extractFileUrls(md, 'image');
},
extractPdfUrls(md: string) {
return markdownUtils.extractFileUrls(md, 'pdf');
},
// The match results have 5 items
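
A sketch of the distinction the token check above relies on (input invented for illustration):

const md = '![img](https://example.com/pic.png) [embedded_pdf](https://example.com/doc.pdf) [manual](https://example.com/other.pdf)';
markdownUtils.extractImageUrls(md); // ['https://example.com/pic.png']
markdownUtils.extractPdfUrls(md); // ['https://example.com/doc.pdf'] - only links whose text is exactly 'embedded_pdf' count as PDF embeds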

View File

@@ -28,6 +28,17 @@ export class MarkupLanguageUtils {
return urls;
}
public extractPdfUrls(language: MarkupLanguage, text: string): string[] {
let urls: string[] = [];
if (language === MarkupLanguage.Any) {
urls = urls.concat(this.lib_(MarkupLanguage.Markdown).extractPdfUrls(text));
urls = urls.concat(this.lib_(MarkupLanguage.Html).extractPdfUrls(text));
} else {
urls = this.lib_(language).extractPdfUrls(text);
}
return urls;
}
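
And a sketch of the dispatch added here (invented example; assumes the markupLanguageUtils instance used elsewhere in the lib package):

const urls = markupLanguageUtils.extractPdfUrls(MarkupLanguage.Markdown, '[embedded_pdf](https://example.com/doc.pdf)');
// ['https://example.com/doc.pdf']; with MarkupLanguage.Any the Markdown and HTML extractors are combined.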
// Create a new MarkupToHtml instance while injecting options specific to Joplin
// desktop and mobile applications.
public newMarkupToHtml(_plugins: PluginStates = null, options: Options = null) {

View File

@@ -1,5 +1,6 @@
import { PaginationOrderDir } from '../../models/utils/types';
import Api, { RequestMethod } from '../../services/rest/Api';
import { extractMediaUrls } from './routes/notes';
import shim from '../../shim';
import { setupDatabaseAndSynchronizer, switchClient, checkThrowAsync, db, msleep, supportDir } from '../../testing/test-utils';
import Folder from '../../models/Folder';
@@ -9,6 +10,7 @@ import Tag from '../../models/Tag';
import NoteTag from '../../models/NoteTag';
import ResourceService from '../../services/ResourceService';
import SearchEngine from '../../services/searchengine/SearchEngine';
const { MarkupToHtml } = require('@joplin/renderer');
import { ResourceEntity } from '../database/types';
const createFolderForPagination = async (num: number, time: number) => {
@@ -452,6 +454,47 @@ describe('services_rest_Api', function() {
expect(response.body).toBe('**Bold text**');
}));
it('should extract media urls from body', (() => {
const tests = [
{
language: MarkupToHtml.MARKUP_LANGUAGE_HTML,
body: '<div> <img src="https://example.com/img.png" /> <embed src="https://example.com/sample.pdf"/> <object data="https://example.com/file.PDF"></object> </div>',
result: ['https://example.com/img.png', 'https://example.com/sample.pdf', 'https://example.com/file.PDF'],
},
{
language: MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN,
body: 'test text \n ![img 1](https://example.com/img1.png) [embedded_pdf](https://example.com/sample1.pdf) [embedded_pdf](https://example.com/file.PDF)',
result: ['https://example.com/img1.png', 'https://example.com/sample1.pdf', 'https://example.com/file.PDF'],
},
{
language: MarkupToHtml.MARKUP_LANGUAGE_HTML,
body: '<div> <embed src="https://example.com/sample"/> <embed /> <object data="https://example.com/file.pdfff"></object> <a href="https://test.com/file.pdf">Link</a> </div>',
result: [],
},
];
tests.forEach((test) => {
const urls = extractMediaUrls(test.language, test.body);
expect(urls).toEqual(test.result);
});
}));
it('should create notes with pdf embeds', (async () => {
let response = null;
const f = await Folder.save({ title: 'pdf test1' });
response = await api.route(RequestMethod.POST, 'notes', null, JSON.stringify({
title: 'testing PDF embeds',
parent_id: f.id,
body_html: `<div> <embed src="file://${supportDir}/welcome.pdf" type="application/pdf" /> </div>`,
}));
const resources = await Resource.all();
expect(resources.length).toBe(1);
const resource = resources[0];
expect(response.body.indexOf(resource.id) >= 0).toBe(true);
}));
it('should handle tokens', (async () => {
api = new Api('mytoken');

View File

@@ -89,6 +89,7 @@ async function requestNoteToNote(requestNote: any) {
output.body = await htmlToMdParser().parse(`<div>${requestNote.body_html}</div>`, {
baseUrl: baseUrl,
anchorNames: requestNote.anchor_names ? requestNote.anchor_names : [],
convertEmbeddedPdfsToLinks: true,
});
output.markup_language = MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN;
}
@@ -143,19 +144,20 @@ async function buildNoteStyleSheet(stylesheets: any[]) {
return output;
}
async function tryToGuessImageExtFromMimeType(response: any, imagePath: string) {
async function tryToGuessExtFromMimeType(response: any, mediaPath: string) {
const mimeType = mimeTypeFromHeaders(response.headers);
if (!mimeType) return imagePath;
if (!mimeType) return mediaPath;
const newExt = mimeUtils.toFileExtension(mimeType);
if (!newExt) return imagePath;
if (!newExt) return mediaPath;
const newImagePath = `${imagePath}.${newExt}`;
await shim.fsDriver().move(imagePath, newImagePath);
return newImagePath;
const newMediaPath = `${mediaPath}.${newExt}`;
await shim.fsDriver().move(mediaPath, newMediaPath);
return newMediaPath;
}
async function downloadImage(url: string /* , allowFileProtocolImages */) {
async function downloadMediaFile(url: string /* , allowFileProtocolImages */) {
const tempDir = Setting.value('tempDir');
// The URL we get to download have been extracted from the Markdown document
@@ -163,6 +165,12 @@ async function downloadImage(url: string /* , allowFileProtocolImages */) {
const isDataUrl = url && url.toLowerCase().indexOf('data:') === 0;
// PDFs and other heavy resources are often served as separate files instead of data URLs; it's very unlikely to encounter a PDF as a data URL.
if (isDataUrl && !url.toLowerCase().startsWith('data:image/')) {
reg.logger().warn(`Resources in data URL format are only supported for images: ${url}`);
return '';
}
const name = isDataUrl ? md5(`${Math.random()}_${Date.now()}`) : filename(url);
let fileExt = isDataUrl ? mimeUtils.toFileExtension(mimeUtils.fromDataUrl(url)) : safeFileExtension(fileExtension(url).toLowerCase());
if (!mimeUtils.fromFileExtension(fileExt)) fileExt = ''; // If the file extension is unknown - clear it.
@@ -170,38 +178,38 @@ async function downloadImage(url: string /* , allowFileProtocolImages */) {
// Append a UUID because simply checking if the file exists is not enough since
// multiple resources can be downloaded at the same time (race condition).
let imagePath = `${tempDir}/${safeFilename(name)}_${uuid.create()}${fileExt}`;
let mediaPath = `${tempDir}/${safeFilename(name)}_${uuid.create()}${fileExt}`;
try {
if (isDataUrl) {
await shim.imageFromDataUrl(url, imagePath);
await shim.imageFromDataUrl(url, mediaPath);
} else if (urlUtils.urlProtocol(url).toLowerCase() === 'file:') {
// Can't think of any reason to disallow this at this point
// if (!allowFileProtocolImages) throw new Error('For security reasons, this URL with file:// protocol cannot be downloaded');
const localPath = fileUriToPath(url);
await shim.fsDriver().copy(localPath, imagePath);
await shim.fsDriver().copy(localPath, mediaPath);
} else {
const response = await shim.fetchBlob(url, { path: imagePath, maxRetry: 1 });
const response = await shim.fetchBlob(url, { path: mediaPath, maxRetry: 1 });
// If we could not find the file extension from the URL, try to get it
// now based on the Content-Type header.
if (!fileExt) imagePath = await tryToGuessImageExtFromMimeType(response, imagePath);
if (!fileExt) mediaPath = await tryToGuessExtFromMimeType(response, mediaPath);
}
return imagePath;
return mediaPath;
} catch (error) {
reg.logger().warn(`Cannot download image at ${url}`, error);
return '';
}
}
async function downloadImages(urls: string[] /* , allowFileProtocolImages:boolean */) {
async function downloadMediaFiles(urls: string[] /* , allowFileProtocolImages:boolean */) {
const PromisePool = require('es6-promise-pool');
const output: any = {};
const downloadOne = async (url: string) => {
const imagePath = await downloadImage(url); // , allowFileProtocolImages);
if (imagePath) output[url] = { path: imagePath, originalUrl: url };
const mediaPath = await downloadMediaFile(url); // , allowFileProtocolImages);
if (mediaPath) output[url] = { path: mediaPath, originalUrl: url };
};
let urlIndex = 0;
@@ -245,27 +253,38 @@ async function removeTempFiles(urls: string[]) {
}
}
function replaceImageUrlsByResources(markupLanguage: number, md: string, urls: any, imageSizes: any) {
function replaceUrlsByResources(markupLanguage: number, md: string, urls: any, imageSizes: any) {
const imageSizesIndexes: any = {};
if (markupLanguage === MarkupToHtml.MARKUP_LANGUAGE_HTML) {
return htmlUtils.replaceImageUrls(md, (imageUrl: string) => {
const urlInfo: any = urls[imageUrl];
if (!urlInfo || !urlInfo.resource) return imageUrl;
return htmlUtils.replaceMediaUrls(md, (url: string) => {
const urlInfo: any = urls[url];
if (!urlInfo || !urlInfo.resource) return url;
return Resource.internalUrl(urlInfo.resource);
});
} else {
// eslint-disable-next-line no-useless-escape
return md.replace(/(!\[.*?\]\()([^\s\)]+)(.*?\))/g, (_match: any, before: string, imageUrl: string, after: string) => {
const urlInfo = urls[imageUrl];
if (!urlInfo || !urlInfo.resource) return before + imageUrl + after;
if (!(urlInfo.originalUrl in imageSizesIndexes)) imageSizesIndexes[urlInfo.originalUrl] = 0;
return md.replace(/(!?\[.*?\]\()([^\s\)]+)(.*?\))/g, (_match: any, before: string, url: string, after: string) => {
let type = 'link';
if (before.startsWith('[embedded_pdf]')) {
type = 'pdf';
} else if (before.startsWith('![')) {
type = 'image';
}
const urlInfo = urls[url];
if (type === 'link' || !urlInfo || !urlInfo.resource) return before + url + after;
const resourceUrl = Resource.internalUrl(urlInfo.resource);
const imageSizesCollection = imageSizes[urlInfo.originalUrl];
if (type === 'pdf') {
return `[${markdownUtils.escapeLinkUrl(url)}](${resourceUrl}${after}`;
}
if (!(urlInfo.originalUrl in imageSizesIndexes)) imageSizesIndexes[urlInfo.originalUrl] = 0;
const imageSizesCollection = imageSizes[urlInfo.originalUrl];
if (!imageSizesCollection) {
// In some cases, we won't find the image size information for that particular URL. Normally
// Either it's not an image, or we don't know the size of the image
// In some cases, we won't find the image size information for that particular image URL. Normally
// it will only happen when using the "Clip simplified page" feature, which can modify the
// image URLs (for example it will select a smaller size resolution). In that case, it's
// fine to return the image as-is because it has already good dimensions.
@@ -284,6 +303,13 @@ function replaceImageUrlsByResources(markupLanguage: number, md: string, urls: a
}
}
export function extractMediaUrls(markupLanguage: number, text: string): string[] {
const urls: string[] = [];
urls.push(...ArrayUtils.unique(markupLanguageUtils.extractImageUrls(markupLanguage, text)));
urls.push(...ArrayUtils.unique(markupLanguageUtils.extractPdfUrls(markupLanguage, text)));
return urls;
}
// Note must have been saved first
async function attachImageFromDataUrl(note: any, imageDataUrl: string, cropRect: any) {
const tempDir = Setting.value('tempDir');
@@ -328,17 +354,17 @@ export default async function(request: Request, id: string = null, link: string
let note: any = await requestNoteToNote(requestNote);
const imageUrls = ArrayUtils.unique(markupLanguageUtils.extractImageUrls(note.markup_language, note.body));
const mediaUrls = extractMediaUrls(note.markup_language, note.body);
reg.logger().info(`Request (${requestId}): Downloading images: ${imageUrls.length}`);
reg.logger().info(`Request (${requestId}): Downloading media files: ${mediaUrls.length}`);
let result = await downloadImages(imageUrls); // , allowFileProtocolImages);
let result = await downloadMediaFiles(mediaUrls); // , allowFileProtocolImages);
reg.logger().info(`Request (${requestId}): Creating resources from paths: ${Object.getOwnPropertyNames(result).length}`);
result = await createResourcesFromPaths(result);
await removeTempFiles(result);
note.body = replaceImageUrlsByResources(note.markup_language, note.body, result, imageSizes);
note.body = replaceUrlsByResources(note.markup_language, note.body, result, imageSizes);
reg.logger().info(`Request (${requestId}): Saving note...`);
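
Taken together, the request path for a clipped note now looks roughly like this (a summary, not code from the commit):

// 1. extractMediaUrls() gathers image URLs plus PDF URLs from [embedded_pdf](...) links and <embed>/<object> tags.
// 2. downloadMediaFiles() fetches each URL to a temp file (data URLs are only accepted for images).
// 3. createResourcesFromPaths() turns the temp files into Joplin resources.
// 4. replaceUrlsByResources() rewrites the note body so each image and embedded_pdf link points to its :/<resource_id>.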