
Clipper: Download images and convert them to resources

This commit is contained in:
Laurent Cozic 2018-05-23 12:14:38 +01:00
parent 7cf267254f
commit 3c5eb99c59
15 changed files with 1355 additions and 1158 deletions
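In outline, the clipper now converts the posted HTML to Markdown, extracts every remote image URL, downloads each image to a temporary file, converts those files to Joplin resources, and rewrites the Markdown to point at the resources before saving the note. A condensed sketch of that flow, mirroring BaseApplication.testing() and the ClipperServer request handler added below (the clipHtmlToNote wrapper is illustrative only; error handling omitted):

// Condensed, illustrative sketch of the new clipper pipeline.
const ClipperServer = require('lib/ClipperServer');
const HtmlToMd = require('lib/HtmlToMd');

async function clipHtmlToNote(html, baseUrl) {
	const server = new ClipperServer();
	// Wrap in <div> so Turndown always gets a single top-level tag; baseUrl resolves relative links.
	let markdown = new HtmlToMd().parse('<div>' + html + '</div>', { baseUrl: baseUrl });
	const imageUrls = server.extractImageUrls_(markdown);              // find ![...](http...) links
	let result = await server.downloadImages_(imageUrls);              // fetch each image into the temp dir
	result = await server.createResourcesFromPaths_(result);           // turn downloaded files into Resource items
	markdown = server.replaceImageUrlsByResources_(markdown, result);  // rewrite links to :/<resource id>
	return markdown;
}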

View File

@ -66,26 +66,6 @@ process.stdout.on('error', function( err ) {
}
});
// async function main() {
// const InteropService = require('lib/services/InteropService');
// const service = new InteropService();
// console.info(service.moduleByFormat('importer', 'enex'));
// //await service.modules();
// }
// main().catch((error) => { console.error(error); });
application.start(process.argv).catch((error) => {
if (error.code == 'flagError') {
console.error(error.message);

View File

@ -13,11 +13,15 @@ if [[ $TEST_FILE != "" ]]; then
exit
fi
(cd "$ROOT_DIR" && npm test tests-build/synchronizer.js)
(cd "$ROOT_DIR" && npm test tests-build/encryption.js
(cd "$ROOT_DIR" && npm test tests-build/ArrayUtils.js)
(cd "$ROOT_DIR" && npm test tests-build/models_Setting.js)
(cd "$ROOT_DIR" && npm test tests-build/models_Note.js)
(cd "$ROOT_DIR" && npm test tests-build/encryption.js
(cd "$ROOT_DIR" && npm test tests-build/EnexToMd.js)
(cd "$ROOT_DIR" && npm test tests-build/HtmlToMd.js)
(cd "$ROOT_DIR" && npm test tests-build/markdownUtils.js)
(cd "$ROOT_DIR" && npm test tests-build/models_Folder.js)
(cd "$ROOT_DIR" && npm test tests-build/models_Note.js)
(cd "$ROOT_DIR" && npm test tests-build/models_Setting.js)
(cd "$ROOT_DIR" && npm test tests-build/services_InteropService.js)
(cd "$ROOT_DIR" && npm test tests-build/EnexToMd.js)
(cd "$ROOT_DIR" && npm test tests-build/services_ResourceService.js)
(cd "$ROOT_DIR" && npm test tests-build/synchronizer.js)
(cd "$ROOT_DIR" && npm test tests-build/urlUtils.js)

View File

@ -0,0 +1,37 @@
require('app-module-path').addPath(__dirname);
const { time } = require('lib/time-utils.js');
const { fileContentEqual, setupDatabase, setupDatabaseAndSynchronizer, db, synchronizer, fileApi, sleep, clearDatabase, switchClient, syncTargetId, objectsEqual, checkThrowAsync } = require('test-utils.js');
const markdownUtils = require('lib/markdownUtils.js');
process.on('unhandledRejection', (reason, p) => {
console.log('Unhandled Rejection at: Promise', p, 'reason:', reason);
});
describe('markdownUtils', function() {
beforeEach(async (done) => {
done();
});
it('should prepend a base URL', async (done) => {
const baseUrl = 'https://test.com/site';
const testCases = [
['[something](testing.html)', '[something](https://test.com/site/testing.html)'],
['![something](/img/test.png)', '![something](https://test.com/img/test.png)'],
['[![something](/img/test.png)](/index.html "Home page")', '[![something](https://test.com/img/test.png)](https://test.com/index.html "Home page")'],
['[onelink.com](/jmp/?id=123&u=http://something.com/test)', '[onelink.com](https://test.com/jmp/?id=123&u=http://something.com/test)'],
['[![some text](/img/test.png)](/jmp/?s=80&l=related&u=http://example.com "some decription")', '[![some text](https://test.com/img/test.png)](https://test.com/jmp/?s=80&l=related&u=http://example.com "some decription")'],
];
for (let i = 0; i < testCases.length; i++) {
const md = testCases[i][0];
const expected = testCases[i][1];
expect(markdownUtils.prependBaseUrl(md, baseUrl)).toBe(expected);
}
done();
});
});

View File

@ -0,0 +1,32 @@
require('app-module-path').addPath(__dirname);
const { time } = require('lib/time-utils.js');
const { fileContentEqual, setupDatabase, setupDatabaseAndSynchronizer, db, synchronizer, fileApi, sleep, clearDatabase, switchClient, syncTargetId, objectsEqual, checkThrowAsync } = require('test-utils.js');
const urlUtils = require('lib/urlUtils.js');
process.on('unhandledRejection', (reason, p) => {
console.log('Unhandled Rejection at: Promise', p, 'reason:', reason);
});
describe('urlUtils', function() {
beforeEach(async (done) => {
done();
});
it('should prepend a base URL', async (done) => {
expect(urlUtils.prependBaseUrl('testing.html', 'http://example.com')).toBe('http://example.com/testing.html');
expect(urlUtils.prependBaseUrl('testing.html', 'http://example.com/')).toBe('http://example.com/testing.html');
expect(urlUtils.prependBaseUrl('/jmp/?id=123&u=http://something.com/test', 'http://example.com/')).toBe('http://example.com/jmp/?id=123&u=http://something.com/test');
expect(urlUtils.prependBaseUrl('/testing.html', 'http://example.com/')).toBe('http://example.com/testing.html');
expect(urlUtils.prependBaseUrl('/testing.html', 'http://example.com/something')).toBe('http://example.com/testing.html');
expect(urlUtils.prependBaseUrl('/testing.html', 'https://example.com/something')).toBe('https://example.com/testing.html');
expect(urlUtils.prependBaseUrl('//somewhereelse.com/testing.html', 'https://example.com/something')).toBe('https://somewhereelse.com/testing.html');
expect(urlUtils.prependBaseUrl('//somewhereelse.com/testing.html', 'http://example.com/something')).toBe('http://somewhereelse.com/testing.html');
expect(urlUtils.prependBaseUrl('', 'http://example.com/something')).toBe('http://example.com/something');
expect(urlUtils.prependBaseUrl('testing.html', '')).toBe('testing.html');
done();
});
});

File diff suppressed because it is too large

View File

@ -386,6 +386,29 @@ class BaseApplication {
return os.homedir() + '/.config/' + Setting.value('appName');
}
async testing() {
const ClipperServer = require('lib/ClipperServer');
const server = new ClipperServer();
const HtmlToMd = require('lib/HtmlToMd');
const service = new HtmlToMd();
const html = await shim.fsDriver().readFile('/mnt/d/test.html');
let markdown = service.parse(html);
console.info(markdown);
console.info('--------------------------------------------------');
const imageUrls = server.extractImageUrls(markdown);
let result = await server.downloadImages(imageUrls);
result = await server.createResourcesFromPaths(result);
console.info(result);
markdown = server.replaceImageUrlByResources(markdown, result);
console.info('--------------------------------------------------');
console.info(markdown);
console.info('--------------------------------------------------');
}
async start(argv) {
let startFlags = await this.handleStartFlags_(argv);
@ -467,6 +490,8 @@ class BaseApplication {
if (!currentFolder) currentFolder = await Folder.defaultFolder();
Setting.setValue('activeFolderId', currentFolder ? currentFolder.id : '');
// await this.testing();process.exit();
this.clipperServer_ = new ClipperServer();
this.clipperServer_.start();

View File

@ -2,101 +2,121 @@ const { netUtils } = require('lib/net-utils');
const urlParser = require("url");
const Note = require('lib/models/Note');
const Folder = require('lib/models/Folder');
const Resource = require('lib/models/Resource');
const Setting = require('lib/models/Setting');
const { shim } = require('lib/shim');
const md5 = require('md5');
const { fileExtension, safeFileExtension, filename } = require('lib/path-utils');
const HtmlToMd = require('lib/HtmlToMd');
const { Logger } = require('lib/logger.js');
class ClipperServer {
constructor() {
this.logger_ = new Logger();
}
setLogger(l) {
this.logger_ = l;
}
logger() {
return this.logger_;
}
htmlToMdParser() {
if (this.htmlToMdParser_) return this.htmlToMdParser_;
this.htmlToMdParser_ = new HtmlToMd();
return this.htmlToMdParser_;
}
readabilityProcess(url) {
return new Promise((resolve, reject) => {
// const Readability = require('readability-node').Readability;
async requestNoteToNote(requestNote) {
const output = {
title: requestNote.title ? requestNote.title : '',
body: requestNote.body ? requestNote.body : '',
};
// var location = document.location;
// var uri = {
// spec: location.href,
// host: location.host,
// prePath: location.protocol + "//" + location.host,
// scheme: location.protocol.substr(0, location.protocol.indexOf(":")),
// pathBase: location.protocol + "//" + location.host + location.pathname.substr(0, location.pathname.lastIndexOf("/") + 1)
// };
// var article = new Readability(uri, document).parse();
if (requestNote.bodyHtml) {
console.info(requestNote.bodyHtml);
// Parsing will not work if the HTML is not wrapped in a top level tag, which is not guaranteed
// when getting the content from elsewhere. So here wrap it - it won't change anything to the final
// rendering but it makes sure everything will be parsed.
output.body = await this.htmlToMdParser().parse('<div>' + requestNote.bodyHtml + '</div>', {
baseUrl: requestNote.baseUrl ? requestNote.baseUrl : '',
});
}
// const read = require('node-readability');
if (requestNote.parent_id) {
output.parent_id = requestNote.parent_id;
} else {
const folder = await Folder.defaultFolder();
if (!folder) throw new Error('Cannot find folder for note');
output.parent_id = folder.id;
}
// read(url, function(error, article, meta) {
// if (error) {
// reject(error);
// return;
// }
// const output = {
// body: article.content,
// title: article.title,
// }
// article.close();
// resolve(output);
// });
// // Main Article
// console.log(article.content);
// // Title
// console.log(article.title);
// // HTML Source Code
// console.log(article.html);
// // DOM
// console.log(article.document);
// // Response Object from Request Lib
// console.log(meta);
// // Close article to clean up jsdom and prevent leaks
// article.close();
});
return output;
}
async requestNoteToNote(requestNote) {
// if (requestNote.url) {
// console.info('Clipper: Got URL: ' + requestNote.url);
// const result = await this.readabilityProcess(requestNote.url);
// return {
// title: result.title,
// body: result.body,
// }
// } else {
const output = {
title: requestNote.title ? requestNote.title : '',
body: requestNote.body ? requestNote.body : '',
};
extractImageUrls_(md) {
// ![some text](http://path/to/image)
const regex = new RegExp(/!\[.*?\]\((http[s]?:\/\/.*?)\)/, 'g')
let match = regex.exec(md);
const output = [];
while (match) {
const url = match[1];
if (output.indexOf(url) < 0) output.push(url);
match = regex.exec(md);
}
return output;
}
if (requestNote.bodyHtml) {
console.info(requestNote.bodyHtml);
// Parsing will not work if the HTML is not wrapped in a top level tag, which is not guaranteed
// when getting the content from elsewhere. So here wrap it - it won't change anything to the final
// rendering but it makes sure everything will be parsed.
output.body = await this.htmlToMdParser().parse('<div>' + requestNote.bodyHtml + '</div>', {
baseUrl: requestNote.baseUrl ? requestNote.baseUrl : '',
});
async downloadImages_(urls) {
const tempDir = Setting.value('tempDir');
const output = {};
for (let i = 0; i < urls.length; i++) {
const url = urls[i];
const name = filename(url);
let fileExt = safeFileExtension(fileExtension(url).toLowerCase());
if (fileExt) fileExt = '.' + fileExt;
let imagePath = tempDir + '/' + name + fileExt;
if (await shim.fsDriver().exists(imagePath)) imagePath = tempDir + '/' + name + '_' + md5(Math.random() + '_' + Date.now()).substr(0,10) + fileExt;
try {
const result = await shim.fetchBlob(url, { path: imagePath });
output[url] = { path: imagePath };
} catch (error) {
this.logger().warn('ClipperServer: Cannot download image at ' + url, error);
}
}
if (requestNote.parent_id) {
output.parent_id = requestNote.parent_id;
} else {
const folder = await Folder.defaultFolder();
if (!folder) throw new Error('Cannot find folder for note');
output.parent_id = folder.id;
return output;
}
async createResourcesFromPaths_(urls) {
for (let url in urls) {
if (!urls.hasOwnProperty(url)) continue;
const urlInfo = urls[url];
try {
const resource = await shim.createResourceFromPath(urlInfo.path);
urlInfo.resource = resource;
} catch (error) {
this.logger().warn('ClipperServer: Cannot create resource for ' + url, error);
}
}
return urls;
}
return output;
// }
replaceImageUrlsByResources_(md, urls) {
let output = md.replace(/(!\[.*?\]\()(http[s]?:\/\/.*?)(\))/g, (match, before, imageUrl, after) => {
const urlInfo = urls[imageUrl];
if (!urlInfo || !urlInfo.resource) return imageUrl;
const resourceUrl = Resource.internalUrl(urlInfo.resource);
return before + resourceUrl + after;
});
return output;
}
async start() {
@ -118,12 +138,6 @@ class ClipperServer {
const writeResponseJson = (code, object) => {
writeCorsHeaders(code);
// response.writeHead(code, {
// "Content-Type": "application/json",
// 'Access-Control-Allow-Origin': '*',
// 'Access-Control-Allow-Methods': 'GET, POST, OPTIONS, PUT, PATCH, DELETE',
// 'Access-Control-Allow-Headers': 'X-Requested-With,content-type',
// });
response.write(JSON.stringify(object));
response.end();
}
@ -142,10 +156,18 @@ class ClipperServer {
request.on('end', async () => {
try {
// console.info('GOT BODY', body);
const requestNote = JSON.parse(body);
// console.info('GOT REQUEST',
let note = await this.requestNoteToNote(requestNote);
// TODO: Provide way to check status (importing image x/y)
// TODO: Delete temp file after import
// TODO: Download multiple images at once
const imageUrls = this.extractImageUrls_(note.body);
let result = await this.downloadImages_(imageUrls);
result = await this.createResourcesFromPaths_(result);
note.body = this.replaceImageUrlsByResources_(note.body, result);
note = await Note.save(note);
return writeResponseJson(200, note);
} catch (error) {

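For context, requestNoteToNote() above reads the following fields from the JSON body that the handler parses in request.on('end'). A hypothetical payload (all values are made up; the endpoint URL and port are not shown in this diff):

// Hypothetical payload consumed by requestNoteToNote(); values are illustrative.
const payload = {
	title: 'Example page',
	bodyHtml: '<p>Some text <img src="/img/test.png"></p>', // converted to Markdown (wrapped in <div>) by HtmlToMd
	baseUrl: 'https://example.com/articles',                // used to make relative links and image URLs absolute
	// body: 'Already-converted **Markdown**',              // alternative to bodyHtml
	// parent_id: 'target-folder-id',                       // optional; falls back to Folder.defaultFolder()
};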
View File

@ -1,16 +1,18 @@
const TurndownService = require('joplin-turndown')
const markdownUtils = require('lib/markdownUtils');
class HtmlToMd {
parse(html) {
parse(html, options = {}) {
const turndownPluginGfm = require('joplin-turndown-plugin-gfm').gfm
const turndown = new TurndownService({
headingStyle: 'atx',
})
turndown.use(turndownPluginGfm)
turndown.remove('script');
let markdown = turndown.turndown(html)
return markdown;
let md = turndown.turndown(html)
if (options.baseUrl) md = markdownUtils.prependBaseUrl(md, options.baseUrl);
return md;
}
}
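A minimal usage sketch of the new options argument, the way ClipperServer calls it (the input HTML and base URL are made up):

const HtmlToMd = require('lib/HtmlToMd');
const service = new HtmlToMd();
// Wrapping in <div> guarantees a single top-level tag so the whole fragment gets parsed;
// baseUrl makes relative links and image paths absolute so they can later be downloaded as resources.
const md = service.parse('<div><p><a href="page.html">link</a> <img src="/img/a.png"></p></div>', {
	baseUrl: 'https://example.com/site',
});
// md is roughly: [link](https://example.com/site/page.html) ![](https://example.com/img/a.png)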

View File

@ -1,3 +1,5 @@
const urlUtils = require('lib/urlUtils');
const markdownUtils = {
// Not really escaping because that's not supported by marked.js
@ -11,6 +13,12 @@ const markdownUtils = {
return url;
},
prependBaseUrl(md, baseUrl) {
return md.replace(/(\]\()([^\s\)]+)(.*?\))/g, (match, before, url, after) => {
return before + urlUtils.prependBaseUrl(url, baseUrl) + after;
});
},
};
module.exports = { markdownUtils };
module.exports = markdownUtils;

View File

@ -6,7 +6,7 @@ const { time } = require('lib/time-utils.js');
const { sprintf } = require('sprintf-js');
const { _ } = require('lib/locale.js');
const moment = require('moment');
const { markdownUtils } = require('lib/markdown-utils.js');
const markdownUtils = require('lib/markdownUtils');
class BaseItem extends BaseModel {

View File

@ -7,7 +7,7 @@ const pathUtils = require('lib/path-utils.js');
const { mime } = require('lib/mime-utils.js');
const { filename } = require('lib/path-utils.js');
const { FsDriverDummy } = require('lib/fs-driver-dummy.js');
const { markdownUtils } = require('lib/markdown-utils.js');
const markdownUtils = require('lib/markdownUtils');
const JoplinError = require('lib/JoplinError');
class Resource extends BaseItem {
@ -135,6 +135,10 @@ class Resource extends BaseItem {
return lines.join('');
}
static internalUrl(resource) {
return ':/' + resource.id;
}
static pathToId(path) {
return filename(path);
}

View File

@ -96,7 +96,7 @@ function shimInit() {
}
}
shim.attachFileToNote = async function(note, filePath, position = null) {
shim.createResourceFromPath = async function(filePath) {
const Resource = require('lib/models/Resource.js');
const { uuid } = require('lib/uuid.js');
const { basename, fileExtension, safeFileExtension } = require('lib/path-utils.js');
@ -126,6 +126,41 @@ function shimInit() {
await Resource.save(resource, { isNew: true });
return resource;
}
shim.attachFileToNote = async function(note, filePath, position = null) {
// const Resource = require('lib/models/Resource.js');
// const { uuid } = require('lib/uuid.js');
// const { basename, fileExtension, safeFileExtension } = require('lib/path-utils.js');
// const mime = require('mime/lite');
// const Note = require('lib/models/Note.js');
// if (!(await fs.pathExists(filePath))) throw new Error(_('Cannot access %s', filePath));
// let resource = Resource.new();
// resource.id = uuid.create();
// resource.mime = mime.getType(filePath);
// resource.title = basename(filePath);
// resource.file_extension = safeFileExtension(fileExtension(filePath));
// if (!resource.mime) resource.mime = 'application/octet-stream';
// let targetPath = Resource.fullPath(resource);
// if (resource.mime == 'image/jpeg' || resource.mime == 'image/jpg' || resource.mime == 'image/png') {
// const result = await resizeImage_(filePath, targetPath, resource.mime);
// } else {
// const stat = await shim.fsDriver().stat(filePath);
// if (stat.size >= 10000000) throw new Error('Resources larger than 10 MB are not currently supported as they may crash the mobile applications. The issue is being investigated and will be fixed at a later time.');
// await fs.copy(filePath, targetPath, { overwrite: true });
// }
// await Resource.save(resource, { isNew: true });
const resource = await shim.createResourceFromPath(filePath);
const newBody = [];
if (position === null) {

View File

@ -1,3 +1,5 @@
const { rtrimSlashes } = require('lib/path-utils');
const urlUtils = {};
urlUtils.hash = function(url) {
@ -6,4 +8,33 @@ urlUtils.hash = function(url) {
return s[s.length - 1];
}
urlUtils.urlWithoutPath = function(url) {
const parsed = require('url').parse(url, true);
return parsed.protocol + '//' + parsed.host;
}
urlUtils.urlProtocol = function(url) {
const parsed = require('url').parse(url, true);
return parsed.protocol;
}
const schemeRegex = /^[a-zA-Z0-9\+\-\.]+:\/\//
urlUtils.prependBaseUrl = function(url, baseUrl) {
baseUrl = rtrimSlashes(baseUrl).trim(); // All the code below assumes that the baseUrl does not end up with a slash
url = url.trim();
if (!url) url = '';
if (!baseUrl) return url;
const matches = schemeRegex.exec(url);
if (matches) return url; // Don't prepend the base URL if the URL already has a scheme
if (url.length >= 2 && url.indexOf('//') === 0) { // If it starts with // it's a protocol-relative URL
return urlUtils.urlProtocol(baseUrl) + url;
} else if (url && url[0] === '/') { // If it starts with a slash, it's an absolute URL so it should be relative to the domain (and not to the full baseUrl)
return urlUtils.urlWithoutPath(baseUrl) + url;
} else {
return baseUrl + (url ? '/' + url : '');
}
}
module.exports = urlUtils;

View File

@ -6082,6 +6082,11 @@
"strict-uri-encode": "1.1.0"
}
},
"querystring": {
"version": "0.2.0",
"resolved": "https://registry.npmjs.org/querystring/-/querystring-0.2.0.tgz",
"integrity": "sha1-sgmEkgO7Jd+CDadW50cAWHhSFiA="
},
"querystringify": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/querystringify/-/querystringify-1.0.0.tgz",
@ -8307,6 +8312,22 @@
"resolved": "https://registry.npmjs.org/urix/-/urix-0.1.0.tgz",
"integrity": "sha1-2pN/emLiH+wf0Y1Js1wpNQZ6bHI="
},
"url": {
"version": "0.11.0",
"resolved": "https://registry.npmjs.org/url/-/url-0.11.0.tgz",
"integrity": "sha1-ODjpfPxgUh63PFJajlW/3Z4uKPE=",
"requires": {
"punycode": "1.3.2",
"querystring": "0.2.0"
},
"dependencies": {
"punycode": {
"version": "1.3.2",
"resolved": "https://registry.npmjs.org/punycode/-/punycode-1.3.2.tgz",
"integrity": "sha1-llOgNvt8HuQjQvIyXM7v6jkmxI0="
}
}
},
"url-parse": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.2.0.tgz",

View File

@ -48,6 +48,7 @@
"stream": "0.0.2",
"string-natural-compare": "^2.0.2",
"timers": "^0.1.1",
"url": "^0.11.0",
"url-parse": "^1.2.0",
"uuid": "^3.0.1",
"valid-url": "^1.0.9",