1
0
mirror of https://github.com/laurent22/joplin.git synced 2024-12-24 10:27:10 +02:00

Clipper: Improved Html To Md conversion

This commit is contained in:
Laurent Cozic 2018-05-22 00:54:23 +01:00
parent 7ed9c2770c
commit 7cf267254f
20 changed files with 1216 additions and 1136 deletions

View File

@ -885,6 +885,19 @@
"integrity": "sha1-vMl5rh+f0FcB5F5S5l06XWPxok4=", "integrity": "sha1-vMl5rh+f0FcB5F5S5l06XWPxok4=",
"dev": true "dev": true
}, },
"joplin-turndown": {
"version": "4.0.3",
"resolved": "https://registry.npmjs.org/joplin-turndown/-/joplin-turndown-4.0.3.tgz",
"integrity": "sha512-WbAXje8wq4/ZLNtPDUFBEtG5zKEbz7Wth5N3vB4Nw7k+PUs3mMF49LVEPP7Kc6H4Ui671qdjpSShvdsmiLY2gA==",
"requires": {
"jsdom": "^11.9.0"
}
},
"joplin-turndown-plugin-gfm": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/joplin-turndown-plugin-gfm/-/joplin-turndown-plugin-gfm-1.0.2.tgz",
"integrity": "sha512-GRXmjHFrEyUnXOYzOZvUGGtKxPm5LuK98+73ZADqQYdGzMWp/o8Qx22YYAeIBsOV2WtVsRxe2IpUGBG4foSRyQ=="
},
"jpeg-js": { "jpeg-js": {
"version": "0.1.2", "version": "0.1.2",
"resolved": "https://registry.npmjs.org/jpeg-js/-/jpeg-js-0.1.2.tgz", "resolved": "https://registry.npmjs.org/jpeg-js/-/jpeg-js-0.1.2.tgz",
@ -2451,19 +2464,6 @@
"safe-buffer": "^5.0.1" "safe-buffer": "^5.0.1"
} }
}, },
"turndown": {
"version": "4.0.2",
"resolved": "https://registry.npmjs.org/turndown/-/turndown-4.0.2.tgz",
"integrity": "sha512-pqZ6WrHFGnxXC9q2xJ3Qa7EoLAwrojgFRajWZjxTKwbz9vnNnyi8lLjiD5h86UTPOcMlEyHjm6NMhjEDdlc25A==",
"requires": {
"jsdom": "^11.9.0"
}
},
"turndown-plugin-gfm": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/turndown-plugin-gfm/-/turndown-plugin-gfm-1.0.2.tgz",
"integrity": "sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg=="
},
"tweetnacl": { "tweetnacl": {
"version": "0.14.5", "version": "0.14.5",
"resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz", "resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz",

View File

@ -36,6 +36,8 @@
"fs-extra": "^5.0.0", "fs-extra": "^5.0.0",
"html-entities": "^1.2.1", "html-entities": "^1.2.1",
"html-minifier": "^3.5.15", "html-minifier": "^3.5.15",
"joplin-turndown": "^4.0.3",
"joplin-turndown-plugin-gfm": "^1.0.2",
"jssha": "^2.3.0", "jssha": "^2.3.0",
"levenshtein": "^1.0.5", "levenshtein": "^1.0.5",
"lodash": "^4.17.4", "lodash": "^4.17.4",
@ -60,8 +62,6 @@
"tar": "^4.4.0", "tar": "^4.4.0",
"tcp-port-used": "^0.1.2", "tcp-port-used": "^0.1.2",
"tkwidgets": "^0.5.26", "tkwidgets": "^0.5.26",
"turndown": "^4.0.2",
"turndown-plugin-gfm": "^1.0.2",
"url-parse": "^1.2.0", "url-parse": "^1.2.0",
"uuid": "^3.0.1", "uuid": "^3.0.1",
"valid-url": "^1.0.9", "valid-url": "^1.0.9",

View File

@ -24,7 +24,7 @@ describe('HtmlToMd', function() {
done(); done();
}); });
it('should convert from Enex to Markdown', asyncTest(async () => { it('should convert from Html to Markdown', asyncTest(async () => {
const basePath = __dirname + '/html_to_md'; const basePath = __dirname + '/html_to_md';
const files = await shim.fsDriver().readDirStats(basePath); const files = await shim.fsDriver().readDirStats(basePath);
const htmlToMd = new HtmlToMd(); const htmlToMd = new HtmlToMd();
@ -36,7 +36,7 @@ describe('HtmlToMd', function() {
const htmlPath = basePath + '/' + htmlFilename; const htmlPath = basePath + '/' + htmlFilename;
const mdPath = basePath + '/' + filename(htmlFilename) + '.md'; const mdPath = basePath + '/' + filename(htmlFilename) + '.md';
if (htmlFilename !== 'table_no_header.html') continue; // if (htmlFilename !== 'anchor_with_newlines.html') continue;
const html = await shim.fsDriver().readFile(htmlPath); const html = await shim.fsDriver().readFile(htmlPath);
const expectedMd = await shim.fsDriver().readFile(mdPath); const expectedMd = await shim.fsDriver().readFile(mdPath);
@ -47,6 +47,8 @@ describe('HtmlToMd', function() {
console.info(''); console.info('');
console.info('Error converting file: ' + htmlFilename); console.info('Error converting file: ' + htmlFilename);
console.info('--------------------------------- Got:'); console.info('--------------------------------- Got:');
console.info(actualMd);
console.info('--------------------------------- Raw:');
console.info(actualMd.split('\n')); console.info(actualMd.split('\n'));
console.info('--------------------------------- Expected:'); console.info('--------------------------------- Expected:');
console.info(expectedMd.split('\n')); console.info(expectedMd.split('\n'));

View File

@ -0,0 +1 @@
<a href="https://joplin.cozic.net"><h1 id="joplin"><img class="title-icon" src="https://joplin.cozic.net/images/Icon512.png">oplin</h1></a>

View File

@ -0,0 +1 @@
[# ![](https://joplin.cozic.net/images/Icon512.png)oplin](https://joplin.cozic.net)

View File

@ -0,0 +1 @@
<a href="http://example.com"><p>That</p><p>Shouldn't be allowed</p></a>

View File

@ -0,0 +1 @@
[That<br>Shouldn't be allowed](http://example.com)

View File

@ -0,0 +1,10 @@
<table>
<tr>
<td></td>
<td>Previous is empty</td>
</tr>
<tr>
<td>Next is empty</td>
<td></td>
</tr>
</table>

View File

@ -0,0 +1,4 @@
| | |
| --- | --- |
| | Previous is empty |
| Next is empty | |

View File

@ -0,0 +1,13 @@
<table>
<tr>
<td>One</td><td>Two</td>
</tr>
<tr></tr>
<tr>
<td>One</td><td>Two</td>
</tr>
<tr></tr>
<tr>
<td>One</td><td>Two</td>
</tr>
</table>

View File

@ -0,0 +1,5 @@
| | |
| --- | --- |
| One | Two |
| One | Two |
| One | Two |

View File

@ -0,0 +1,6 @@
<table>
<tr>
<td><p>Some paragraph</p><p>inside a table cell</p></td>
<td>Second column</td>
</tr>
</table>

View File

@ -0,0 +1,3 @@
| | |
| --- | --- |
| Some paragraph<br><br>inside a table cell | Second column |

View File

@ -0,0 +1,16 @@
<!--
Only the inner table is rendered as a Markdown table; the outer one is not.
Any table that contains another table has its own cells rendered as plain text.
-->
<table>
<tr><td>
First column, and an inner table:
<table>
<tr><td>One</td><td>Two</td></tr>
<tr><td>One</td><td>Two</td></tr>
</table>
</td>
<td>Second column</td>
</tr>
</table>

View File

@ -0,0 +1,8 @@
First column, and an inner table:
| | |
| --- | --- |
| One | Two |
| One | Two |
Second column

File diff suppressed because it is too large Load Diff

View File

@ -87,6 +87,8 @@
"fs-extra": "^5.0.0", "fs-extra": "^5.0.0",
"highlight.js": "^9.12.0", "highlight.js": "^9.12.0",
"html-entities": "^1.2.1", "html-entities": "^1.2.1",
"joplin-turndown": "^4.0.3",
"joplin-turndown-plugin-gfm": "^1.0.5",
"jssha": "^2.3.1", "jssha": "^2.3.1",
"katex": "^0.9.0-beta1", "katex": "^0.9.0-beta1",
"levenshtein": "^1.0.5", "levenshtein": "^1.0.5",
@ -116,8 +118,6 @@
"string-to-stream": "^1.1.0", "string-to-stream": "^1.1.0",
"tar": "^4.4.0", "tar": "^4.4.0",
"tcp-port-used": "^0.1.2", "tcp-port-used": "^0.1.2",
"turndown": "^4.0.2",
"turndown-plugin-gfm": "^1.0.2",
"url-parse": "^1.2.0", "url-parse": "^1.2.0",
"uuid": "^3.1.0", "uuid": "^3.1.0",
"valid-url": "^1.0.9", "valid-url": "^1.0.9",

View File

@ -1,10 +1,12 @@
const TurndownService = require('turndown') const TurndownService = require('joplin-turndown')
class HtmlToMd { class HtmlToMd {
parse(html) { parse(html) {
const turndownPluginGfm = require('turndown-plugin-gfm').gfm const turndownPluginGfm = require('joplin-turndown-plugin-gfm').gfm
const turndown = new TurndownService() const turndown = new TurndownService({
headingStyle: 'atx',
})
turndown.use(turndownPluginGfm) turndown.use(turndownPluginGfm)
turndown.remove('script'); turndown.remove('script');
let markdown = turndown.turndown(html) let markdown = turndown.turndown(html)

View File

@ -2,5 +2,7 @@
ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
cd "$ROOT_DIR/CliClient/node_modules" cd "$ROOT_DIR/CliClient/node_modules"
rm -rf tkwidgets rm -rf tkwidgets joplin-turndown joplin-turndown-plugin-gfm
ln -s /mnt/d/Docs/PROGS/Node/tkwidgets/src tkwidgets ln -s /mnt/d/Docs/PROGS/Node/tkwidgets/src tkwidgets
ln -s /mnt/d/Temp/turndown-plugin-gfm joplin-turndown-plugin-gfm
ln -s /mnt/d/Temp/turndown joplin-turndown