From c0bc4c38c341410f78ddff3417402cda6489faa5 Mon Sep 17 00:00:00 2001 From: asrient <44570278+asrient@users.noreply.github.com> Date: Mon, 20 Jun 2022 18:26:54 +0530 Subject: [PATCH] Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384) --- packages/app-clipper/content_scripts/index.js | 34 ++++++++ packages/lib/HtmlToMd.ts | 33 ++++++- packages/lib/htmlUtils.ts | 29 +++++++ packages/lib/markdownUtils.ts | 16 +++- packages/lib/markupLanguageUtils.ts | 11 +++ packages/lib/services/rest/Api.test.ts | 43 ++++++++++ packages/lib/services/rest/routes/notes.ts | 86 ++++++++++++------- 7 files changed, 217 insertions(+), 35 deletions(-) diff --git a/packages/app-clipper/content_scripts/index.js b/packages/app-clipper/content_scripts/index.js index 052b9130e..3ccc01b1f 100644 --- a/packages/app-clipper/content_scripts/index.js +++ b/packages/app-clipper/content_scripts/index.js @@ -32,6 +32,15 @@ } } + function escapeHtml(s) { + return s + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); + } + function pageTitle() { const titleElements = document.getElementsByTagName('title'); if (titleElements.length) return titleElements[0].text.trim(); @@ -204,6 +213,16 @@ } } + if (nodeName === 'embed') { + const src = absoluteUrl(node.src); + node.setAttribute('src', src); + } + + if (nodeName === 'object') { + const data = absoluteUrl(node.data); + node.setAttribute('data', data); + } + cleanUpElement(convertToMarkup, node, imageSizes, imageIndexes); } } @@ -317,6 +336,9 @@ } function readabilityProcess() { + + if (isPagePdf()) throw new Error('Could not parse PDF document with Readability'); + // eslint-disable-next-line no-undef const readability = new Readability(documentForReadability()); const article = readability.parse(); @@ -329,6 +351,14 @@ }; } + function isPagePdf() { + return document.contentType == 'application/pdf'; + } + + function embedPageUrl() { + return ``; + } + async function prepareCommandResponse(command) { console.info(`Got command: ${command.name}`); const shouldSendToJoplin = !!command.shouldSendToJoplin; @@ -375,6 +405,10 @@ } else if (command.name === 'completePageHtml') { + if (isPagePdf()) { + return clippedContentResponse(pageTitle(), embedPageUrl(), getImageSizes(document), getAnchorNames(document)); + } + hardcodePreStyles(document); addSvgClass(document); preProcessDocument(document); diff --git a/packages/lib/HtmlToMd.ts b/packages/lib/HtmlToMd.ts index e38f0d090..0c7361aef 100644 --- a/packages/lib/HtmlToMd.ts +++ b/packages/lib/HtmlToMd.ts @@ -2,17 +2,20 @@ const TurndownService = require('@joplin/turndown'); const turndownPluginGfm = require('@joplin/turndown-plugin-gfm').gfm; import markdownUtils from './markdownUtils'; +const pdfUrlRegex = /[\s\S]*?\.pdf$/i; + export interface ParseOptions { anchorNames?: string[]; preserveImageTagsWithSize?: boolean; baseUrl?: string; disableEscapeContent?: boolean; + convertEmbeddedPdfsToLinks?: boolean; } export default class HtmlToMd { public parse(html: string, options: ParseOptions = {}) { - const turndown = new TurndownService({ + const turndownOpts: any = { headingStyle: 'atx', anchorNames: options.anchorNames ? options.anchorNames.map(n => n.trim().toLowerCase()) : [], codeBlockStyle: 'fenced', @@ -22,10 +25,36 @@ export default class HtmlToMd { strongDelimiter: '**', br: '', disableEscapeContent: 'disableEscapeContent' in options ? options.disableEscapeContent : false, - }); + }; + if (options.convertEmbeddedPdfsToLinks) { + // Turndown ignores empty tags, so we need to handle this case seperately + // https://github.com/mixmark-io/turndown/issues/293#issuecomment-588984202 + turndownOpts.blankReplacement = (content: string, node: any) => { + if (node.matches('object')) { + return pdfRule.replacement(content, node, {}); + } + return '\n\n'; + }; + } + const turndown = new TurndownService(turndownOpts); turndown.use(turndownPluginGfm); turndown.remove('script'); turndown.remove('style'); + const pdfRule = { + filter: ['embed', 'object'], + replacement: function(_content: string, node: any, _options: any) { + // We are setting embedded_pdf as name so that we can later distingish them from normal links and create resources for them. + if (node.matches('embed') && node.getAttribute('src') && pdfUrlRegex.test(node.getAttribute('src'))) { + return `[embedded_pdf](${node.getAttribute('src')})`; + } else if (node.matches('object') && node.getAttribute('data') && pdfUrlRegex.test(node.getAttribute('data'))) { + return `[embedded_pdf](${node.getAttribute('data')})`; + } + return ''; + }, + }; + if (options.convertEmbeddedPdfsToLinks) { + turndown.addRule('pdf', pdfRule); + } let md = turndown.turndown(html); if (options.baseUrl) md = markdownUtils.prependBaseUrl(md, options.baseUrl); return md; diff --git a/packages/lib/htmlUtils.ts b/packages/lib/htmlUtils.ts index fff72e3ec..c5bcf66a6 100644 --- a/packages/lib/htmlUtils.ts +++ b/packages/lib/htmlUtils.ts @@ -7,6 +7,9 @@ const { escapeHtml } = require('./string-utils.js'); // https://stackoverflow.com/a/16119722/561309 const imageRegex = //gi; const anchorRegex = //gi; +const embedRegex = //gi; +const objectRegex = //gi; +const pdfUrlRegex = /[\s\S]*?\.pdf$/i; const selfClosingElements = [ 'area', @@ -61,6 +64,11 @@ class HtmlUtils { return this.extractUrls(imageRegex, html); } + // Returns the **encoded** URLs, so to be useful they should be decoded again before use. + public extractPdfUrls(html: string) { + return [...this.extractUrls(embedRegex, html), ...this.extractUrls(objectRegex, html)].filter(url => pdfUrlRegex.test(url)); + } + // Returns the **encoded** URLs, so to be useful they should be decoded again before use. public extractAnchorUrls(html: string) { return this.extractUrls(anchorRegex, html); @@ -87,6 +95,27 @@ class HtmlUtils { }); } + public replaceEmbedUrls(html: string, callback: Function) { + if (!html) return ''; + // We are adding the link as since joplin disabled , tags due to security reasons. + // See: CVE-2020-15930 + html = html.replace(embedRegex, (_v: string, _before: string, src: string, _after: string) => { + const link = callback(src); + return `${escapeHtml(src)}`; + }); + html = html.replace(objectRegex, (_v: string, _before: string, src: string, _after: string) => { + const link = callback(src); + return `${escapeHtml(src)}`; + }); + return html; + } + + public replaceMediaUrls(html: string, callback: Function) { + html = this.replaceImageUrls(html, callback); + html = this.replaceEmbedUrls(html, callback); + return html; + } + // Note that the URLs provided by this function are URL-encoded, which is // usually what you want for web URLs. But if they are file:// URLs and the // file path is going to be used, it will need to be unescaped first. The diff --git a/packages/lib/markdownUtils.ts b/packages/lib/markdownUtils.ts index d13c9bee8..48cfafe2f 100644 --- a/packages/lib/markdownUtils.ts +++ b/packages/lib/markdownUtils.ts @@ -69,7 +69,7 @@ const markdownUtils = { }, // Returns the **encoded** URLs, so to be useful they should be decoded again before use. - extractFileUrls(md: string, onlyImage: boolean = false): Array { + extractFileUrls(md: string, onlyType: string = null): Array { const markdownIt = new MarkdownIt(); markdownIt.validateLink = validateLinks; // Necessary to support file:/// links @@ -77,10 +77,16 @@ const markdownUtils = { const tokens = markdownIt.parse(md, env); const output: string[] = []; + let linkType = onlyType; + if (linkType === 'pdf') linkType = 'link_open'; + const searchUrls = (tokens: any[]) => { for (let i = 0; i < tokens.length; i++) { const token = tokens[i]; - if ((onlyImage === true && token.type === 'image') || (onlyImage === false && (token.type === 'image' || token.type === 'link_open'))) { + if ((!onlyType && (token.type === 'link_open' || token.type === 'image')) || (!!onlyType && token.type === onlyType) || (onlyType == 'pdf' && token.type === 'link_open')) { + // Pdf embeds are a special case, they are represented as 'link_open' tokens but are marked with 'embedded_pdf' as link name by the parser + // We are making sure if its in the proper pdf link format, only then we add it to the list + if (onlyType === 'pdf' && !(tokens.length > i + 1 && tokens[i + 1].type === 'text' && tokens[i + 1].content === 'embedded_pdf')) continue; for (let j = 0; j < token.attrs.length; j++) { const a = token.attrs[j]; if ((a[0] === 'src' || a[0] === 'href') && a.length >= 2 && a[1]) { @@ -107,7 +113,11 @@ const markdownUtils = { }, extractImageUrls(md: string) { - return markdownUtils.extractFileUrls(md,true); + return markdownUtils.extractFileUrls(md, 'image'); + }, + + extractPdfUrls(md: string) { + return markdownUtils.extractFileUrls(md, 'pdf'); }, // The match results has 5 items diff --git a/packages/lib/markupLanguageUtils.ts b/packages/lib/markupLanguageUtils.ts index 55f2409c7..b68eb0513 100644 --- a/packages/lib/markupLanguageUtils.ts +++ b/packages/lib/markupLanguageUtils.ts @@ -28,6 +28,17 @@ export class MarkupLanguageUtils { return urls; } + public extractPdfUrls(language: MarkupLanguage, text: string): string[] { + let urls: string[] = []; + if (language === MarkupLanguage.Any) { + urls = urls.concat(this.lib_(MarkupLanguage.Markdown).extractPdfUrls(text)); + urls = urls.concat(this.lib_(MarkupLanguage.Html).extractPdfUrls(text)); + } else { + urls = this.lib_(language).extractPdfUrls(text); + } + return urls; + } + // Create a new MarkupToHtml instance while injecting options specific to Joplin // desktop and mobile applications. public newMarkupToHtml(_plugins: PluginStates = null, options: Options = null) { diff --git a/packages/lib/services/rest/Api.test.ts b/packages/lib/services/rest/Api.test.ts index b962f1df9..f56d76100 100644 --- a/packages/lib/services/rest/Api.test.ts +++ b/packages/lib/services/rest/Api.test.ts @@ -1,5 +1,6 @@ import { PaginationOrderDir } from '../../models/utils/types'; import Api, { RequestMethod } from '../../services/rest/Api'; +import { extractMediaUrls } from './routes/notes'; import shim from '../../shim'; import { setupDatabaseAndSynchronizer, switchClient, checkThrowAsync, db, msleep, supportDir } from '../../testing/test-utils'; import Folder from '../../models/Folder'; @@ -9,6 +10,7 @@ import Tag from '../../models/Tag'; import NoteTag from '../../models/NoteTag'; import ResourceService from '../../services/ResourceService'; import SearchEngine from '../../services/searchengine/SearchEngine'; +const { MarkupToHtml } = require('@joplin/renderer'); import { ResourceEntity } from '../database/types'; const createFolderForPagination = async (num: number, time: number) => { @@ -452,6 +454,47 @@ describe('services_rest_Api', function() { expect(response.body).toBe('**Bold text**'); })); + it('should extract media urls from body', (() => { + const tests = [ + { + language: MarkupToHtml.MARKUP_LANGUAGE_HTML, + body: '
', + result: ['https://example.com/img.png', 'https://example.com/sample.pdf', 'https://example.com/file.PDF'], + }, + { + language: MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN, + body: 'test text \n ![img 1](https://example.com/img1.png) [embedded_pdf](https://example.com/sample1.pdf) [embedded_pdf](https://example.com/file.PDF)', + result: ['https://example.com/img1.png', 'https://example.com/sample1.pdf', 'https://example.com/file.PDF'], + }, + { + language: MarkupToHtml.MARKUP_LANGUAGE_HTML, + body: '', + result: [], + }, + ]; + tests.forEach((test) => { + const urls = extractMediaUrls(test.language, test.body); + expect(urls).toEqual(test.result); + }); + })); + + it('should create notes with pdf embeds', (async () => { + let response = null; + const f = await Folder.save({ title: 'pdf test1' }); + + response = await api.route(RequestMethod.POST, 'notes', null, JSON.stringify({ + title: 'testing PDF embeds', + parent_id: f.id, + body_html: `
`, + })); + + const resources = await Resource.all(); + expect(resources.length).toBe(1); + + const resource = resources[0]; + expect(response.body.indexOf(resource.id) >= 0).toBe(true); + })); + it('should handle tokens', (async () => { api = new Api('mytoken'); diff --git a/packages/lib/services/rest/routes/notes.ts b/packages/lib/services/rest/routes/notes.ts index 426ad69c9..2a8f4def4 100644 --- a/packages/lib/services/rest/routes/notes.ts +++ b/packages/lib/services/rest/routes/notes.ts @@ -89,6 +89,7 @@ async function requestNoteToNote(requestNote: any) { output.body = await htmlToMdParser().parse(`
${requestNote.body_html}
`, { baseUrl: baseUrl, anchorNames: requestNote.anchor_names ? requestNote.anchor_names : [], + convertEmbeddedPdfsToLinks: true, }); output.markup_language = MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN; } @@ -143,19 +144,20 @@ async function buildNoteStyleSheet(stylesheets: any[]) { return output; } -async function tryToGuessImageExtFromMimeType(response: any, imagePath: string) { +async function tryToGuessExtFromMimeType(response: any, mediaPath: string) { const mimeType = mimeTypeFromHeaders(response.headers); - if (!mimeType) return imagePath; + if (!mimeType) return mediaPath; const newExt = mimeUtils.toFileExtension(mimeType); - if (!newExt) return imagePath; + if (!newExt) return mediaPath; - const newImagePath = `${imagePath}.${newExt}`; - await shim.fsDriver().move(imagePath, newImagePath); - return newImagePath; + const newMediaPath = `${mediaPath}.${newExt}`; + await shim.fsDriver().move(mediaPath, newMediaPath); + return newMediaPath; } -async function downloadImage(url: string /* , allowFileProtocolImages */) { +async function downloadMediaFile(url: string /* , allowFileProtocolImages */) { + const tempDir = Setting.value('tempDir'); // The URL we get to download have been extracted from the Markdown document @@ -163,6 +165,12 @@ async function downloadImage(url: string /* , allowFileProtocolImages */) { const isDataUrl = url && url.toLowerCase().indexOf('data:') === 0; + // PDFs and other heavy resoucres are often served as seperate files insted of data urls, its very unlikely to encounter a pdf as a data url + if (isDataUrl && !url.toLowerCase().startsWith('data:image/')) { + reg.logger().warn(`Resources in data URL format is only supported for images ${url}`); + return ''; + } + const name = isDataUrl ? md5(`${Math.random()}_${Date.now()}`) : filename(url); let fileExt = isDataUrl ? mimeUtils.toFileExtension(mimeUtils.fromDataUrl(url)) : safeFileExtension(fileExtension(url).toLowerCase()); if (!mimeUtils.fromFileExtension(fileExt)) fileExt = ''; // If the file extension is unknown - clear it. @@ -170,38 +178,38 @@ async function downloadImage(url: string /* , allowFileProtocolImages */) { // Append a UUID because simply checking if the file exists is not enough since // multiple resources can be downloaded at the same time (race condition). - let imagePath = `${tempDir}/${safeFilename(name)}_${uuid.create()}${fileExt}`; + let mediaPath = `${tempDir}/${safeFilename(name)}_${uuid.create()}${fileExt}`; try { if (isDataUrl) { - await shim.imageFromDataUrl(url, imagePath); + await shim.imageFromDataUrl(url, mediaPath); } else if (urlUtils.urlProtocol(url).toLowerCase() === 'file:') { // Can't think of any reason to disallow this at this point // if (!allowFileProtocolImages) throw new Error('For security reasons, this URL with file:// protocol cannot be downloaded'); const localPath = fileUriToPath(url); - await shim.fsDriver().copy(localPath, imagePath); + await shim.fsDriver().copy(localPath, mediaPath); } else { - const response = await shim.fetchBlob(url, { path: imagePath, maxRetry: 1 }); + const response = await shim.fetchBlob(url, { path: mediaPath, maxRetry: 1 }); // If we could not find the file extension from the URL, try to get it // now based on the Content-Type header. - if (!fileExt) imagePath = await tryToGuessImageExtFromMimeType(response, imagePath); + if (!fileExt) mediaPath = await tryToGuessExtFromMimeType(response, mediaPath); } - return imagePath; + return mediaPath; } catch (error) { reg.logger().warn(`Cannot download image at ${url}`, error); return ''; } } -async function downloadImages(urls: string[] /* , allowFileProtocolImages:boolean */) { +async function downloadMediaFiles(urls: string[] /* , allowFileProtocolImages:boolean */) { const PromisePool = require('es6-promise-pool'); const output: any = {}; const downloadOne = async (url: string) => { - const imagePath = await downloadImage(url); // , allowFileProtocolImages); - if (imagePath) output[url] = { path: imagePath, originalUrl: url }; + const mediaPath = await downloadMediaFile(url); // , allowFileProtocolImages); + if (mediaPath) output[url] = { path: mediaPath, originalUrl: url }; }; let urlIndex = 0; @@ -245,27 +253,38 @@ async function removeTempFiles(urls: string[]) { } } -function replaceImageUrlsByResources(markupLanguage: number, md: string, urls: any, imageSizes: any) { +function replaceUrlsByResources(markupLanguage: number, md: string, urls: any, imageSizes: any) { const imageSizesIndexes: any = {}; if (markupLanguage === MarkupToHtml.MARKUP_LANGUAGE_HTML) { - return htmlUtils.replaceImageUrls(md, (imageUrl: string) => { - const urlInfo: any = urls[imageUrl]; - if (!urlInfo || !urlInfo.resource) return imageUrl; + return htmlUtils.replaceMediaUrls(md, (url: string) => { + const urlInfo: any = urls[url]; + if (!urlInfo || !urlInfo.resource) return url; return Resource.internalUrl(urlInfo.resource); }); } else { // eslint-disable-next-line no-useless-escape - return md.replace(/(!\[.*?\]\()([^\s\)]+)(.*?\))/g, (_match: any, before: string, imageUrl: string, after: string) => { - const urlInfo = urls[imageUrl]; - if (!urlInfo || !urlInfo.resource) return before + imageUrl + after; - if (!(urlInfo.originalUrl in imageSizesIndexes)) imageSizesIndexes[urlInfo.originalUrl] = 0; + return md.replace(/(!?\[.*?\]\()([^\s\)]+)(.*?\))/g, (_match: any, before: string, url: string, after: string) => { + let type = 'link'; + if (before.startsWith('[embedded_pdf]')) { + type = 'pdf'; + } else if (before.startsWith('![')) { + type = 'image'; + } + + const urlInfo = urls[url]; + if (type === 'link' || !urlInfo || !urlInfo.resource) return before + url + after; const resourceUrl = Resource.internalUrl(urlInfo.resource); - const imageSizesCollection = imageSizes[urlInfo.originalUrl]; + if (type === 'pdf') { + return `[${markdownUtils.escapeLinkUrl(url)}](${resourceUrl}${after}`; + } + if (!(urlInfo.originalUrl in imageSizesIndexes)) imageSizesIndexes[urlInfo.originalUrl] = 0; + const imageSizesCollection = imageSizes[urlInfo.originalUrl]; if (!imageSizesCollection) { - // In some cases, we won't find the image size information for that particular URL. Normally + // Either its not an image or we don't know the size of the image + // In some cases, we won't find the image size information for that particular image URL. Normally // it will only happen when using the "Clip simplified page" feature, which can modify the // image URLs (for example it will select a smaller size resolution). In that case, it's // fine to return the image as-is because it has already good dimensions. @@ -284,6 +303,13 @@ function replaceImageUrlsByResources(markupLanguage: number, md: string, urls: a } } +export function extractMediaUrls(markupLanguage: number, text: string): string[] { + const urls: string[] = []; + urls.push(...ArrayUtils.unique(markupLanguageUtils.extractImageUrls(markupLanguage, text))); + urls.push(...ArrayUtils.unique(markupLanguageUtils.extractPdfUrls(markupLanguage, text))); + return urls; +} + // Note must have been saved first async function attachImageFromDataUrl(note: any, imageDataUrl: string, cropRect: any) { const tempDir = Setting.value('tempDir'); @@ -328,17 +354,17 @@ export default async function(request: Request, id: string = null, link: string let note: any = await requestNoteToNote(requestNote); - const imageUrls = ArrayUtils.unique(markupLanguageUtils.extractImageUrls(note.markup_language, note.body)); + const mediaUrls = extractMediaUrls(note.markup_language, note.body); - reg.logger().info(`Request (${requestId}): Downloading images: ${imageUrls.length}`); + reg.logger().info(`Request (${requestId}): Downloading media files: ${mediaUrls.length}`); - let result = await downloadImages(imageUrls); // , allowFileProtocolImages); + let result = await downloadMediaFiles(mediaUrls); // , allowFileProtocolImages); reg.logger().info(`Request (${requestId}): Creating resources from paths: ${Object.getOwnPropertyNames(result).length}`); result = await createResourcesFromPaths(result); await removeTempFiles(result); - note.body = replaceImageUrlsByResources(note.markup_language, note.body, result, imageSizes); + note.body = replaceUrlsByResources(note.markup_language, note.body, result, imageSizes); reg.logger().info(`Request (${requestId}): Saving note...`);