From c0bc4c38c341410f78ddff3417402cda6489faa5 Mon Sep 17 00:00:00 2001
From: asrient <44570278+asrient@users.noreply.github.com>
Date: Mon, 20 Jun 2022 18:26:54 +0530
Subject: [PATCH] Clipper: Resolves #6247: Clipper unable to pull and store
PDFs (#6384)
---
packages/app-clipper/content_scripts/index.js | 34 ++++++++
packages/lib/HtmlToMd.ts | 33 ++++++-
packages/lib/htmlUtils.ts | 29 +++++++
packages/lib/markdownUtils.ts | 16 +++-
packages/lib/markupLanguageUtils.ts | 11 +++
packages/lib/services/rest/Api.test.ts | 43 ++++++++++
packages/lib/services/rest/routes/notes.ts | 86 ++++++++++++-------
7 files changed, 217 insertions(+), 35 deletions(-)
diff --git a/packages/app-clipper/content_scripts/index.js b/packages/app-clipper/content_scripts/index.js
index 052b9130e..3ccc01b1f 100644
--- a/packages/app-clipper/content_scripts/index.js
+++ b/packages/app-clipper/content_scripts/index.js
@@ -32,6 +32,15 @@
}
}
+ function escapeHtml(s) {
+ return s
+ .replace(/&/g, '&amp;')
+ .replace(/</g, '&lt;')
+ .replace(/>/g, '&gt;')
+ .replace(/"/g, '&quot;')
+ .replace(/'/g, '&#039;');
+ }
+
function pageTitle() {
const titleElements = document.getElementsByTagName('title');
if (titleElements.length) return titleElements[0].text.trim();
@@ -204,6 +213,16 @@
}
}
+ if (nodeName === 'embed') {
+ const src = absoluteUrl(node.src);
+ node.setAttribute('src', src);
+ }
+
+ if (nodeName === 'object') {
+ const data = absoluteUrl(node.data);
+ node.setAttribute('data', data);
+ }
+
cleanUpElement(convertToMarkup, node, imageSizes, imageIndexes);
}
}
@@ -317,6 +336,9 @@
}
function readabilityProcess() {
+
+ if (isPagePdf()) throw new Error('Could not parse PDF document with Readability');
+
// eslint-disable-next-line no-undef
const readability = new Readability(documentForReadability());
const article = readability.parse();
@@ -329,6 +351,14 @@
};
}
+ function isPagePdf() {
+ return document.contentType == 'application/pdf';
+ }
+
+ function embedPageUrl() {
+ return `<embed src="${escapeHtml(window.location.href)}" type="${escapeHtml(document.contentType)}" />`;
+ }
+
async function prepareCommandResponse(command) {
console.info(`Got command: ${command.name}`);
const shouldSendToJoplin = !!command.shouldSendToJoplin;
@@ -375,6 +405,10 @@
} else if (command.name === 'completePageHtml') {
+ if (isPagePdf()) {
+ return clippedContentResponse(pageTitle(), embedPageUrl(), getImageSizes(document), getAnchorNames(document));
+ }
+
hardcodePreStyles(document);
addSvgClass(document);
preProcessDocument(document);
diff --git a/packages/lib/HtmlToMd.ts b/packages/lib/HtmlToMd.ts
index e38f0d090..0c7361aef 100644
--- a/packages/lib/HtmlToMd.ts
+++ b/packages/lib/HtmlToMd.ts
@@ -2,17 +2,20 @@ const TurndownService = require('@joplin/turndown');
const turndownPluginGfm = require('@joplin/turndown-plugin-gfm').gfm;
import markdownUtils from './markdownUtils';
+const pdfUrlRegex = /[\s\S]*?\.pdf$/i;
+
export interface ParseOptions {
anchorNames?: string[];
preserveImageTagsWithSize?: boolean;
baseUrl?: string;
disableEscapeContent?: boolean;
+ convertEmbeddedPdfsToLinks?: boolean;
}
export default class HtmlToMd {
public parse(html: string, options: ParseOptions = {}) {
- const turndown = new TurndownService({
+ const turndownOpts: any = {
headingStyle: 'atx',
anchorNames: options.anchorNames ? options.anchorNames.map(n => n.trim().toLowerCase()) : [],
codeBlockStyle: 'fenced',
@@ -22,10 +25,36 @@ export default class HtmlToMd {
strongDelimiter: '**',
br: '',
disableEscapeContent: 'disableEscapeContent' in options ? options.disableEscapeContent : false,
- });
+ };
+ if (options.convertEmbeddedPdfsToLinks) {
+ // Turndown ignores empty