mirror of https://github.com/laurent22/joplin.git synced 2024-12-24 10:27:10 +02:00

Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384)

asrient 2022-06-20 18:26:54 +05:30 committed by GitHub
parent 0c50a5ab9b
commit c0bc4c38c3
7 changed files with 217 additions and 35 deletions

View File

@@ -32,6 +32,15 @@
}
}
function escapeHtml(s) {
return s
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&#039;');
}
function pageTitle() {
const titleElements = document.getElementsByTagName('title');
if (titleElements.length) return titleElements[0].text.trim();
@@ -204,6 +213,16 @@
}
}
if (nodeName === 'embed') {
const src = absoluteUrl(node.src);
node.setAttribute('src', src);
}
if (nodeName === 'object') {
const data = absoluteUrl(node.data);
node.setAttribute('data', data);
}
cleanUpElement(convertToMarkup, node, imageSizes, imageIndexes);
}
}
@@ -317,6 +336,9 @@
}
function readabilityProcess() {
if (isPagePdf()) throw new Error('Could not parse PDF document with Readability');
// eslint-disable-next-line no-undef
const readability = new Readability(documentForReadability());
const article = readability.parse();
@@ -329,6 +351,14 @@
};
}
function isPagePdf() {
return document.contentType == 'application/pdf';
}
function embedPageUrl() {
return `<embed src="${escapeHtml(window.location.href)}" type="${escapeHtml(document.contentType)}" />`;
}
async function prepareCommandResponse(command) {
console.info(`Got command: ${command.name}`);
const shouldSendToJoplin = !!command.shouldSendToJoplin;
@@ -375,6 +405,10 @@
} else if (command.name === 'completePageHtml') {
if (isPagePdf()) {
return clippedContentResponse(pageTitle(), embedPageUrl(), getImageSizes(document), getAnchorNames(document));
}
hardcodePreStyles(document);
addSvgClass(document);
preProcessDocument(document);
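
For orientation, here is a minimal sketch (not part of the commit) of what the content script now clips for a PDF tab; the URL is an illustrative assumption:

// Sketch: on a tab where document.contentType is 'application/pdf',
// readabilityProcess() refuses to run and completePageHtml clips a single <embed> instead.
// With window.location.href === 'https://example.com/paper.pdf':
//   isPagePdf()     // true
//   embedPageUrl()  // '<embed src="https://example.com/paper.pdf" type="application/pdf" />'
// This markup is what HtmlToMd later converts into an [embedded_pdf](...) link.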

View File

@@ -2,17 +2,20 @@ const TurndownService = require('@joplin/turndown');
const turndownPluginGfm = require('@joplin/turndown-plugin-gfm').gfm;
import markdownUtils from './markdownUtils';
const pdfUrlRegex = /[\s\S]*?\.pdf$/i;
export interface ParseOptions {
anchorNames?: string[];
preserveImageTagsWithSize?: boolean;
baseUrl?: string;
disableEscapeContent?: boolean;
convertEmbeddedPdfsToLinks?: boolean;
}
export default class HtmlToMd {
public parse(html: string, options: ParseOptions = {}) {
const turndown = new TurndownService({
const turndownOpts: any = {
headingStyle: 'atx',
anchorNames: options.anchorNames ? options.anchorNames.map(n => n.trim().toLowerCase()) : [],
codeBlockStyle: 'fenced',
@@ -22,10 +25,36 @@ export default class HtmlToMd {
strongDelimiter: '**',
br: '',
disableEscapeContent: 'disableEscapeContent' in options ? options.disableEscapeContent : false,
});
};
if (options.convertEmbeddedPdfsToLinks) {
// Turndown ignores empty <object> tags, so we need to handle this case separately
// https://github.com/mixmark-io/turndown/issues/293#issuecomment-588984202
turndownOpts.blankReplacement = (content: string, node: any) => {
if (node.matches('object')) {
return pdfRule.replacement(content, node, {});
}
return '\n\n';
};
}
const turndown = new TurndownService(turndownOpts);
turndown.use(turndownPluginGfm);
turndown.remove('script');
turndown.remove('style');
const pdfRule = {
filter: ['embed', 'object'],
replacement: function(_content: string, node: any, _options: any) {
// We set 'embedded_pdf' as the link name so that we can later distinguish these links from normal ones and create resources for them.
if (node.matches('embed') && node.getAttribute('src') && pdfUrlRegex.test(node.getAttribute('src'))) {
return `[embedded_pdf](${node.getAttribute('src')})`;
} else if (node.matches('object') && node.getAttribute('data') && pdfUrlRegex.test(node.getAttribute('data'))) {
return `[embedded_pdf](${node.getAttribute('data')})`;
}
return '';
},
};
if (options.convertEmbeddedPdfsToLinks) {
turndown.addRule('pdf', pdfRule);
}
let md = turndown.turndown(html);
if (options.baseUrl) md = markdownUtils.prependBaseUrl(md, options.baseUrl);
return md;
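
A usage sketch of the new option (not part of the diff; the input HTML and variable names are examples):

const htmlToMd = new HtmlToMd();
const md = htmlToMd.parse('<div><embed src="https://example.com/sample.pdf" type="application/pdf" /></div>', { convertEmbeddedPdfsToLinks: true });
// With the pdf rule above, md becomes: [embedded_pdf](https://example.com/sample.pdf)
// Without the option, the rule is not registered and the tags fall through to Turndown's default handling.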

View File

@@ -7,6 +7,9 @@ const { escapeHtml } = require('./string-utils.js');
// https://stackoverflow.com/a/16119722/561309
const imageRegex = /<img([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const anchorRegex = /<a([\s\S]*?)href=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const embedRegex = /<embed([\s\S]*?)src=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const objectRegex = /<object([\s\S]*?)data=["']([\s\S]*?)["']([\s\S]*?)>/gi;
const pdfUrlRegex = /[\s\S]*?\.pdf$/i;
const selfClosingElements = [
'area',
@@ -61,6 +64,11 @@ class HtmlUtils {
return this.extractUrls(imageRegex, html);
}
// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
public extractPdfUrls(html: string) {
return [...this.extractUrls(embedRegex, html), ...this.extractUrls(objectRegex, html)].filter(url => pdfUrlRegex.test(url));
}
// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
public extractAnchorUrls(html: string) {
return this.extractUrls(anchorRegex, html);
@@ -87,6 +95,27 @@ class HtmlUtils {
});
}
public replaceEmbedUrls(html: string, callback: Function) {
if (!html) return '';
// We add the link as an <a> tag since Joplin disables <embed> and <object> tags for security reasons.
// See: CVE-2020-15930
html = html.replace(embedRegex, (_v: string, _before: string, src: string, _after: string) => {
const link = callback(src);
return `<a href="${link}">${escapeHtml(src)}</a>`;
});
html = html.replace(objectRegex, (_v: string, _before: string, src: string, _after: string) => {
const link = callback(src);
return `<a href="${link}">${escapeHtml(src)}</a>`;
});
return html;
}
public replaceMediaUrls(html: string, callback: Function) {
html = this.replaceImageUrls(html, callback);
html = this.replaceEmbedUrls(html, callback);
return html;
}
// Note that the URLs provided by this function are URL-encoded, which is
// usually what you want for web URLs. But if they are file:// URLs and the
// file path is going to be used, it will need to be unescaped first. The
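
A quick sketch of the new helpers (illustrative only; assumes the HtmlUtils instance exported by this module and a made-up resource link):

const html = '<embed src="https://example.com/a.pdf" /> <object data="https://example.com/b.PDF"></object>';
htmlUtils.extractPdfUrls(html); // ['https://example.com/a.pdf', 'https://example.com/b.PDF'] - the .pdf check is case-insensitive
htmlUtils.replaceEmbedUrls(html, () => ':/0123456789abcdef0123456789abcdef'); // each tag becomes <a href=":/...">original source URL</a>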

View File

@@ -69,7 +69,7 @@ const markdownUtils = {
},
// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
extractFileUrls(md: string, onlyImage: boolean = false): Array<string> {
extractFileUrls(md: string, onlyType: string = null): Array<string> {
const markdownIt = new MarkdownIt();
markdownIt.validateLink = validateLinks; // Necessary to support file:/// links
@@ -77,10 +77,16 @@ const markdownUtils = {
const tokens = markdownIt.parse(md, env);
const output: string[] = [];
let linkType = onlyType;
if (linkType === 'pdf') linkType = 'link_open';
const searchUrls = (tokens: any[]) => {
for (let i = 0; i < tokens.length; i++) {
const token = tokens[i];
if ((onlyImage === true && token.type === 'image') || (onlyImage === false && (token.type === 'image' || token.type === 'link_open'))) {
if ((!onlyType && (token.type === 'link_open' || token.type === 'image')) || (!!onlyType && token.type === onlyType) || (onlyType == 'pdf' && token.type === 'link_open')) {
// PDF embeds are a special case: they are represented as 'link_open' tokens but are marked with 'embedded_pdf' as the link name by the parser.
// We make sure the token is in the proper PDF link format before adding it to the list.
if (onlyType === 'pdf' && !(tokens.length > i + 1 && tokens[i + 1].type === 'text' && tokens[i + 1].content === 'embedded_pdf')) continue;
for (let j = 0; j < token.attrs.length; j++) {
const a = token.attrs[j];
if ((a[0] === 'src' || a[0] === 'href') && a.length >= 2 && a[1]) {
@@ -107,7 +113,11 @@ const markdownUtils = {
},
extractImageUrls(md: string) {
return markdownUtils.extractFileUrls(md,true);
return markdownUtils.extractFileUrls(md, 'image');
},
extractPdfUrls(md: string) {
return markdownUtils.extractFileUrls(md, 'pdf');
},
// The match results have 5 items
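
A sketch of the distinction the token check above relies on (input invented for illustration):

const md = '![img](https://example.com/pic.png) [embedded_pdf](https://example.com/doc.pdf) [manual](https://example.com/other.pdf)';
markdownUtils.extractImageUrls(md); // ['https://example.com/pic.png']
markdownUtils.extractPdfUrls(md); // ['https://example.com/doc.pdf'] - only links whose text is exactly 'embedded_pdf' count as PDF embeds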

View File

@@ -28,6 +28,17 @@ export class MarkupLanguageUtils {
return urls;
}
public extractPdfUrls(language: MarkupLanguage, text: string): string[] {
let urls: string[] = [];
if (language === MarkupLanguage.Any) {
urls = urls.concat(this.lib_(MarkupLanguage.Markdown).extractPdfUrls(text));
urls = urls.concat(this.lib_(MarkupLanguage.Html).extractPdfUrls(text));
} else {
urls = this.lib_(language).extractPdfUrls(text);
}
return urls;
}
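
And a sketch of the dispatch added here (invented example; assumes the markupLanguageUtils instance used elsewhere in the lib package):

const urls = markupLanguageUtils.extractPdfUrls(MarkupLanguage.Markdown, '[embedded_pdf](https://example.com/doc.pdf)');
// ['https://example.com/doc.pdf']; with MarkupLanguage.Any the Markdown and HTML extractors are combined.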
// Create a new MarkupToHtml instance while injecting options specific to Joplin
// desktop and mobile applications.
public newMarkupToHtml(_plugins: PluginStates = null, options: Options = null) {

View File

@@ -1,5 +1,6 @@
import { PaginationOrderDir } from '../../models/utils/types';
import Api, { RequestMethod } from '../../services/rest/Api';
import { extractMediaUrls } from './routes/notes';
import shim from '../../shim';
import { setupDatabaseAndSynchronizer, switchClient, checkThrowAsync, db, msleep, supportDir } from '../../testing/test-utils';
import Folder from '../../models/Folder';
@@ -9,6 +10,7 @@ import Tag from '../../models/Tag';
import NoteTag from '../../models/NoteTag';
import ResourceService from '../../services/ResourceService';
import SearchEngine from '../../services/searchengine/SearchEngine';
const { MarkupToHtml } = require('@joplin/renderer');
import { ResourceEntity } from '../database/types';
const createFolderForPagination = async (num: number, time: number) => {
@@ -452,6 +454,47 @@ describe('services_rest_Api', function() {
expect(response.body).toBe('**Bold text**');
}));
it('should extract media urls from body', (() => {
const tests = [
{
language: MarkupToHtml.MARKUP_LANGUAGE_HTML,
body: '<div> <img src="https://example.com/img.png" /> <embed src="https://example.com/sample.pdf"/> <object data="https://example.com/file.PDF"></object> </div>',
result: ['https://example.com/img.png', 'https://example.com/sample.pdf', 'https://example.com/file.PDF'],
},
{
language: MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN,
body: 'test text \n ![img 1](https://example.com/img1.png) [embedded_pdf](https://example.com/sample1.pdf) [embedded_pdf](https://example.com/file.PDF)',
result: ['https://example.com/img1.png', 'https://example.com/sample1.pdf', 'https://example.com/file.PDF'],
},
{
language: MarkupToHtml.MARKUP_LANGUAGE_HTML,
body: '<div> <embed src="https://example.com/sample"/> <embed /> <object data="https://example.com/file.pdfff"></object> <a href="https://test.com/file.pdf">Link</a> </div>',
result: [],
},
];
tests.forEach((test) => {
const urls = extractMediaUrls(test.language, test.body);
expect(urls).toEqual(test.result);
});
}));
it('should create notes with pdf embeds', (async () => {
let response = null;
const f = await Folder.save({ title: 'pdf test1' });
response = await api.route(RequestMethod.POST, 'notes', null, JSON.stringify({
title: 'testing PDF embeds',
parent_id: f.id,
body_html: `<div> <embed src="file://${supportDir}/welcome.pdf" type="application/pdf" /> </div>`,
}));
const resources = await Resource.all();
expect(resources.length).toBe(1);
const resource = resources[0];
expect(response.body.indexOf(resource.id) >= 0).toBe(true);
}));
it('should handle tokens', (async () => {
api = new Api('mytoken');

View File

@@ -89,6 +89,7 @@ async function requestNoteToNote(requestNote: any) {
output.body = await htmlToMdParser().parse(`<div>${requestNote.body_html}</div>`, {
baseUrl: baseUrl,
anchorNames: requestNote.anchor_names ? requestNote.anchor_names : [],
convertEmbeddedPdfsToLinks: true,
});
output.markup_language = MarkupToHtml.MARKUP_LANGUAGE_MARKDOWN;
}
@@ -143,19 +144,20 @@ async function buildNoteStyleSheet(stylesheets: any[]) {
return output;
}
async function tryToGuessImageExtFromMimeType(response: any, imagePath: string) {
async function tryToGuessExtFromMimeType(response: any, mediaPath: string) {
const mimeType = mimeTypeFromHeaders(response.headers);
if (!mimeType) return imagePath;
if (!mimeType) return mediaPath;
const newExt = mimeUtils.toFileExtension(mimeType);
if (!newExt) return imagePath;
if (!newExt) return mediaPath;
const newImagePath = `${imagePath}.${newExt}`;
await shim.fsDriver().move(imagePath, newImagePath);
return newImagePath;
const newMediaPath = `${mediaPath}.${newExt}`;
await shim.fsDriver().move(mediaPath, newMediaPath);
return newMediaPath;
}
async function downloadImage(url: string /* , allowFileProtocolImages */) {
async function downloadMediaFile(url: string /* , allowFileProtocolImages */) {
const tempDir = Setting.value('tempDir');
// The URL we get to download have been extracted from the Markdown document
@@ -163,6 +165,12 @@ async function downloadImage(url: string /* , allowFileProtocolImages */) {
const isDataUrl = url && url.toLowerCase().indexOf('data:') === 0;
// PDFs and other heavy resources are often served as separate files instead of data URLs; it's very unlikely to encounter a PDF as a data URL.
if (isDataUrl && !url.toLowerCase().startsWith('data:image/')) {
reg.logger().warn(`Resources in data URL format are only supported for images: ${url}`);
return '';
}
const name = isDataUrl ? md5(`${Math.random()}_${Date.now()}`) : filename(url);
let fileExt = isDataUrl ? mimeUtils.toFileExtension(mimeUtils.fromDataUrl(url)) : safeFileExtension(fileExtension(url).toLowerCase());
if (!mimeUtils.fromFileExtension(fileExt)) fileExt = ''; // If the file extension is unknown - clear it.
@@ -170,38 +178,38 @@ async function downloadImage(url: string /* , allowFileProtocolImages */) {
// Append a UUID because simply checking if the file exists is not enough since
// multiple resources can be downloaded at the same time (race condition).
let imagePath = `${tempDir}/${safeFilename(name)}_${uuid.create()}${fileExt}`;
let mediaPath = `${tempDir}/${safeFilename(name)}_${uuid.create()}${fileExt}`;
try {
if (isDataUrl) {
await shim.imageFromDataUrl(url, imagePath);
await shim.imageFromDataUrl(url, mediaPath);
} else if (urlUtils.urlProtocol(url).toLowerCase() === 'file:') {
// Can't think of any reason to disallow this at this point
// if (!allowFileProtocolImages) throw new Error('For security reasons, this URL with file:// protocol cannot be downloaded');
const localPath = fileUriToPath(url);
await shim.fsDriver().copy(localPath, imagePath);
await shim.fsDriver().copy(localPath, mediaPath);
} else {
const response = await shim.fetchBlob(url, { path: imagePath, maxRetry: 1 });
const response = await shim.fetchBlob(url, { path: mediaPath, maxRetry: 1 });
// If we could not find the file extension from the URL, try to get it
// now based on the Content-Type header.
if (!fileExt) imagePath = await tryToGuessImageExtFromMimeType(response, imagePath);
if (!fileExt) mediaPath = await tryToGuessExtFromMimeType(response, mediaPath);
}
return imagePath;
return mediaPath;
} catch (error) {
reg.logger().warn(`Cannot download image at ${url}`, error);
return '';
}
}
async function downloadImages(urls: string[] /* , allowFileProtocolImages:boolean */) {
async function downloadMediaFiles(urls: string[] /* , allowFileProtocolImages:boolean */) {
const PromisePool = require('es6-promise-pool');
const output: any = {};
const downloadOne = async (url: string) => {
const imagePath = await downloadImage(url); // , allowFileProtocolImages);
if (imagePath) output[url] = { path: imagePath, originalUrl: url };
const mediaPath = await downloadMediaFile(url); // , allowFileProtocolImages);
if (mediaPath) output[url] = { path: mediaPath, originalUrl: url };
};
let urlIndex = 0;
@@ -245,27 +253,38 @@ async function removeTempFiles(urls: string[]) {
}
}
function replaceImageUrlsByResources(markupLanguage: number, md: string, urls: any, imageSizes: any) {
function replaceUrlsByResources(markupLanguage: number, md: string, urls: any, imageSizes: any) {
const imageSizesIndexes: any = {};
if (markupLanguage === MarkupToHtml.MARKUP_LANGUAGE_HTML) {
return htmlUtils.replaceImageUrls(md, (imageUrl: string) => {
const urlInfo: any = urls[imageUrl];
if (!urlInfo || !urlInfo.resource) return imageUrl;
return htmlUtils.replaceMediaUrls(md, (url: string) => {
const urlInfo: any = urls[url];
if (!urlInfo || !urlInfo.resource) return url;
return Resource.internalUrl(urlInfo.resource);
});
} else {
// eslint-disable-next-line no-useless-escape
return md.replace(/(!\[.*?\]\()([^\s\)]+)(.*?\))/g, (_match: any, before: string, imageUrl: string, after: string) => {
const urlInfo = urls[imageUrl];
if (!urlInfo || !urlInfo.resource) return before + imageUrl + after;
if (!(urlInfo.originalUrl in imageSizesIndexes)) imageSizesIndexes[urlInfo.originalUrl] = 0;
return md.replace(/(!?\[.*?\]\()([^\s\)]+)(.*?\))/g, (_match: any, before: string, url: string, after: string) => {
let type = 'link';
if (before.startsWith('[embedded_pdf]')) {
type = 'pdf';
} else if (before.startsWith('![')) {
type = 'image';
}
const urlInfo = urls[url];
if (type === 'link' || !urlInfo || !urlInfo.resource) return before + url + after;
const resourceUrl = Resource.internalUrl(urlInfo.resource);
const imageSizesCollection = imageSizes[urlInfo.originalUrl];
if (type === 'pdf') {
return `[${markdownUtils.escapeLinkUrl(url)}](${resourceUrl}${after}`;
}
if (!(urlInfo.originalUrl in imageSizesIndexes)) imageSizesIndexes[urlInfo.originalUrl] = 0;
const imageSizesCollection = imageSizes[urlInfo.originalUrl];
if (!imageSizesCollection) {
// In some cases, we won't find the image size information for that particular URL. Normally
// Either it's not an image, or we don't know the size of the image
// In some cases, we won't find the image size information for that particular image URL. Normally
// it will only happen when using the "Clip simplified page" feature, which can modify the
// image URLs (for example it will select a smaller size resolution). In that case, it's
// fine to return the image as-is because it has already good dimensions.
@@ -284,6 +303,13 @@ function replaceImageUrlsByResources(markupLanguage: number, md: string, urls: a
}
}
export function extractMediaUrls(markupLanguage: number, text: string): string[] {
const urls: string[] = [];
urls.push(...ArrayUtils.unique(markupLanguageUtils.extractImageUrls(markupLanguage, text)));
urls.push(...ArrayUtils.unique(markupLanguageUtils.extractPdfUrls(markupLanguage, text)));
return urls;
}
// Note must have been saved first
async function attachImageFromDataUrl(note: any, imageDataUrl: string, cropRect: any) {
const tempDir = Setting.value('tempDir');
@@ -328,17 +354,17 @@ export default async function(request: Request, id: string = null, link: string
let note: any = await requestNoteToNote(requestNote);
const imageUrls = ArrayUtils.unique(markupLanguageUtils.extractImageUrls(note.markup_language, note.body));
const mediaUrls = extractMediaUrls(note.markup_language, note.body);
reg.logger().info(`Request (${requestId}): Downloading images: ${imageUrls.length}`);
reg.logger().info(`Request (${requestId}): Downloading media files: ${mediaUrls.length}`);
let result = await downloadImages(imageUrls); // , allowFileProtocolImages);
let result = await downloadMediaFiles(mediaUrls); // , allowFileProtocolImages);
reg.logger().info(`Request (${requestId}): Creating resources from paths: ${Object.getOwnPropertyNames(result).length}`);
result = await createResourcesFromPaths(result);
await removeTempFiles(result);
note.body = replaceImageUrlsByResources(note.markup_language, note.body, result, imageSizes);
note.body = replaceUrlsByResources(note.markup_language, note.body, result, imageSizes);
reg.logger().info(`Request (${requestId}): Saving note...`);
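
Taken together, the request path for a clipped note now looks roughly like this (a summary, not code from the commit):

// 1. extractMediaUrls() gathers image URLs plus PDF URLs from [embedded_pdf](...) links and <embed>/<object> tags.
// 2. downloadMediaFiles() fetches each URL to a temp file (data URLs are only accepted for images).
// 3. createResourcesFromPaths() turns the temp files into Joplin resources.
// 4. replaceUrlsByResources() rewrites the note body so each image and embedded_pdf link points to its :/<resource_id>.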