Clipper: Resolves #6247: Clipper unable to pull and store PDFs (#6384)

2025-11-26 22:41:17 +02:00 · 2022-06-20 18:26:54 +05:30
parent 0c50a5ab9b
commit c0bc4c38c3
7 changed files with 217 additions and 35 deletions
--- a/packages/lib/markdownUtils.ts
+++ b/packages/lib/markdownUtils.ts
@@ -69,7 +69,7 @@ const markdownUtils = {
 	},

 	// Returns the **encoded** URLs, so to be useful they should be decoded again before use.
-	extractFileUrls(md: string, onlyImage: boolean = false): Array<string> {
+	extractFileUrls(md: string, onlyType: string = null): Array<string> {
 		const markdownIt = new MarkdownIt();
 		markdownIt.validateLink = validateLinks; // Necessary to support file:/// links

@@ -77,10 +77,16 @@ const markdownUtils = {
 		const tokens = markdownIt.parse(md, env);
 		const output: string[] = [];

+		let linkType = onlyType;
+		if (linkType === 'pdf') linkType = 'link_open';
+
 		const searchUrls = (tokens: any[]) => {
 			for (let i = 0; i < tokens.length; i++) {
 				const token = tokens[i];
-				if ((onlyImage === true && token.type === 'image') || (onlyImage === false && (token.type === 'image' || token.type === 'link_open'))) {
+				if ((!onlyType && (token.type === 'link_open' || token.type === 'image')) || (!!onlyType && token.type === onlyType) || (onlyType == 'pdf' && token.type === 'link_open')) {
+					// Pdf embeds are a special case, they are represented as 'link_open' tokens but are marked with 'embedded_pdf' as link name by the parser
+					// We are making sure if its in the proper pdf link format, only then we add it to the list
+					if (onlyType === 'pdf' && !(tokens.length > i + 1 && tokens[i + 1].type === 'text' && tokens[i + 1].content === 'embedded_pdf')) continue;
 					for (let j = 0; j < token.attrs.length; j++) {
 						const a = token.attrs[j];
 						if ((a[0] === 'src' || a[0] === 'href') && a.length >= 2 && a[1]) {
@@ -107,7 +113,11 @@ const markdownUtils = {
 	},

 	extractImageUrls(md: string) {
-		return markdownUtils.extractFileUrls(md,true);
+		return markdownUtils.extractFileUrls(md, 'image');
+	},
+
+	extractPdfUrls(md: string) {
+		return markdownUtils.extractFileUrls(md, 'pdf');
 	},

 	// The match results has 5 items