1
0
mirror of https://github.com/laurent22/joplin.git synced 2025-04-11 11:12:03 +02:00

xml parsing

This commit is contained in:
Laurent Cozic 2017-06-07 20:21:04 +01:00
parent 96c3c20991
commit 4b27aba0d8
4 changed files with 453 additions and 169 deletions

View File

@ -8,6 +8,388 @@ const Promise = require('promise');
const fs = require('fs');
const xml2js = require("xml2js");
// Marker tokens that are pushed into the intermediate token array while a note
// is parsed. processMdArrayNewLines() later turns them into "\n" / " " characters.

// Pushed when a block-level element (div, p, list, heading, code...) opens.
const BLOCK_OPEN = "<div>";
// Pushed when such an element closes.
const BLOCK_CLOSE = "</div>";
// Pushed for an explicit <br> tag.
const NEWLINE = "<br/>";
// Substituted by processMdArrayNewLines() for a BLOCK_CLOSE immediately followed by a BLOCK_OPEN.
const NEWLINE_MERGED = "<merged/>";
// Pushed by collapseWhiteSpaceAndAppend() where a text node started or ended with a space.
const SPACE = "<space/>";
/**
 * Renders the flat token array built while parsing a note into a Markdown string.
 *
 * The array mixes plain text with the marker tokens BLOCK_OPEN, BLOCK_CLOSE,
 * NEWLINE, NEWLINE_MERGED and SPACE. Redundant markers are removed in several
 * passes, then every remaining newline-type marker becomes "\n" and every
 * SPACE marker becomes a single " ".
 *
 * @param {string[]} md - Token array. It is not modified.
 * @returns {string} The rendered text, or '' when the result is only white space.
 */
function processMdArrayNewLines(md) {
	// Work on a copy so that the caller's array is left untouched.
	let tokens = md.slice();

	// Block markers at the very start / end would only produce blank lines.
	while (tokens.length && tokens[0] == BLOCK_OPEN) tokens.shift();
	while (tokens.length && tokens[tokens.length - 1] == BLOCK_CLOSE) tokens.pop();

	// Returns the items for which shouldDrop(previousItem, item) is false.
	// "previousItem" is the previous *input* item, whether or not it was kept.
	const dropWhere = (items, shouldDrop) => {
		const kept = [];
		let prev = '';
		for (const item of items) {
			if (!shouldDrop(prev, item)) kept.push(item);
			prev = item;
		}
		return kept;
	};

	// Collapse runs of identical block markers into a single one.
	tokens = dropWhere(tokens, (prev, cur) => isNewLineBlock(prev) && isNewLineBlock(cur) && prev == cur);

	// A block closing directly followed by a block opening is a single line break.
	const merged = [];
	let last = '';
	for (const v of tokens) {
		if (last == BLOCK_CLOSE && v == BLOCK_OPEN) {
			merged.pop();
			merged.push(NEWLINE_MERGED);
		} else {
			merged.push(v);
		}
		last = v;
	}
	tokens = merged;

	// An explicit <br> already breaks the line: drop the marker that follows it.
	tokens = dropWhere(tokens, (prev, cur) => prev == NEWLINE && (cur == NEWLINE_MERGED || cur == BLOCK_CLOSE));
	tokens = dropWhere(tokens, (prev, cur) => prev == NEWLINE && (cur == NEWLINE_MERGED || cur == BLOCK_OPEN));

	// Avoid a doubled line break at the very end.
	if (tokens.length > 2) {
		if (tokens[tokens.length - 2] == NEWLINE_MERGED && tokens[tokens.length - 1] == NEWLINE) {
			tokens.pop();
		}
	}

	let output = '';
	let previous = ''; // Text emitted for the previous token (not the token itself)
	let start = true;
	for (const v of tokens) {
		let add = '';
		if (v == BLOCK_CLOSE || v == BLOCK_OPEN || v == NEWLINE || v == NEWLINE_MERGED) {
			add = "\n";
		} else if (v == SPACE) {
			// `previous` holds emitted text, so it must be compared with " " and
			// not with the SPACE marker, otherwise two consecutive SPACE tokens
			// are rendered as two spaces.
			if (previous == " " || previous == "\n" || start) {
				continue; // skip
			}
			add = " ";
		} else {
			add = v;
		}
		start = false;
		output += add;
		previous = add;
	}

	if (!output.trim().length) return '';
	return output;
}
/**
 * Tells whether `c` is one of the ASCII white space characters: line feed,
 * carriage return, vertical tab, form feed, tab or space.
 * The non-breaking space is deliberately not in that list.
 *
 * @param {string} c - A single character.
 * @returns {boolean}
 */
function isWhiteSpace(c) {
	switch (c) {
		case '\n':
		case '\r':
		case '\v':
		case '\f':
		case '\t':
		case ' ':
			return true;
		default:
			return false;
	}
}
/**
 * Like QString::simplified(), except that it preserves non-breaking spaces
 * (which Evernote uses for indentation, etc.).
 *
 * Every run of white space (as defined by isWhiteSpace()) is reduced to its
 * first character, then white space at both ends of the result is removed.
 *
 * @param {string} s - Text to simplify.
 * @returns {string} The simplified text.
 */
function simplifyString(s) {
	const kept = [];
	let inWhiteRun = false;
	for (let pos = 0; pos < s.length; pos++) {
		const ch = s[pos];
		const white = isWhiteSpace(ch);
		// Only the first character of a white space run is kept
		if (!(white && inWhiteRun)) kept.push(ch);
		inWhiteRun = white;
	}

	// Trim both ends
	let from = 0;
	let to = kept.length;
	while (from < to && isWhiteSpace(kept[from])) from++;
	while (to > from && isWhiteSpace(kept[to - 1])) to--;

	return kept.slice(from, to).join('');
}
/**
 * Appends the content of a text node to the token array.
 *
 * Inside a code block (`state.inCode`) the text is kept verbatim and prefixed
 * with a tab. Otherwise line breaks around the text are removed, inner white
 * space is collapsed with simplifyString(), and a leading / trailing space is
 * recorded as a separate SPACE token so that it can be merged with neighbouring
 * tokens later on.
 *
 * @param {string[]} lines - Token array; it is appended to in place.
 * @param {{inCode: boolean}} state - Current parser state.
 * @param {string} text - Raw text of the node.
 * @returns {string[]} The same `lines` array.
 */
function collapseWhiteSpaceAndAppend(lines, state, text) {
	if (state.inCode) {
		lines.push("\t" + text);
		return lines;
	}

	// Remove all \n and \r from the left and right of the text
	const unwrapped = text.replace(/^[\n\r]+|[\n\r]+$/g, '');

	// Collapse all white spaces to just one. If there are spaces to the left and
	// right of the string also collapse them to just one space.
	const spaceLeft = unwrapped.startsWith(' ');
	const spaceRight = unwrapped.endsWith(' ');
	const simplified = simplifyString(unwrapped);

	// Nothing but line breaks: there is nothing to append.
	if (!spaceLeft && !spaceRight && simplified == "") return lines;

	if (spaceLeft) lines.push(SPACE);
	lines.push(simplified);
	if (spaceRight) lines.push(SPACE);

	return lines;
}
// MIME types for which a resource is rendered as an image rather than as a link.
const imageMimeTypes = [
	"image/cgm",
	"image/fits",
	"image/g3fax",
	"image/gif",
	"image/ief",
	"image/jp2",
	"image/jpeg",
	"image/jpm",
	"image/jpx",
	"image/naplps",
	"image/png",
	"image/prs.btif",
	"image/prs.pti",
	"image/t38",
	"image/tiff",
	"image/tiff-fx",
	"image/vnd.adobe.photoshop",
	"image/vnd.cns.inf2",
	"image/vnd.djvu",
	"image/vnd.dwg",
	"image/vnd.dxf",
	"image/vnd.fastbidsheet",
	"image/vnd.fpx",
	"image/vnd.fst",
	"image/vnd.fujixerox.edmics-mmr",
	"image/vnd.fujixerox.edmics-rlc",
	"image/vnd.globalgraphics.pgb",
	"image/vnd.microsoft.icon",
	"image/vnd.mix",
	"image/vnd.ms-modi",
	"image/vnd.net-fpx",
	"image/vnd.sealed.png",
	"image/vnd.sealedmedia.softseal.gif",
	"image/vnd.sealedmedia.softseal.jpg",
	"image/vnd.svf",
	"image/vnd.wap.wbmp",
	"image/vnd.xiff",
];

/**
 * Tells whether `m` is one of the MIME types listed in `imageMimeTypes`.
 *
 * @param {string} m - MIME type, e.g. "image/png".
 * @returns {boolean}
 */
function isImageMimeType(m) {
	return imageMimeTypes.includes(m);
}
/**
 * Appends the Markdown tokens that reference a resource: an image tag when the
 * resource has an image MIME type, a regular link otherwise.
 *
 * @param {string[]} lines - Token array; it is appended to in place.
 * @param {{id: string, mime: string, alt: string}} resource - Resource to reference.
 * @param {string} [alt=""] - Label to use; when empty, `resource.alt` is used.
 * @returns {string[]} The same `lines` array.
 */
function addResourceTag(lines, resource, alt = "") {
	const label = alt == "" ? resource.alt : alt;
	// Images and links only differ by the leading "!"
	const opener = isImageMimeType(resource.mime) ? "![" : "[";
	lines.push(opener, label, "](:/" + resource.id + ")");
	return lines;
}
/**
 * Parses the note XML read from `stream` with a SAX parser and converts it to
 * a flat array of tokens: Markdown fragments mixed with the BLOCK_OPEN,
 * BLOCK_CLOSE, NEWLINE and SPACE markers. The array is meant to be rendered by
 * processMdArrayNewLines().
 *
 * @param {object} stream - Readable stream providing the XML; it is piped into the parser.
 * @returns {Promise<string[]>} Resolves with the token array once the parser
 *   emits "end". Rejects with the parser or stream error, or with a message
 *   string when an unsupported tag or an <li> outside of a list is found.
 */
function enexXmlToMd(stream) {
	return new Promise((resolve, reject) => {
		let output = [];

		const state = {
			inCode: false, // True while inside a <pre> or <code> element
			lists: [], // Stack of the lists being parsed: { tag, counter }
			anchorAttributes: [], // Stack of the attributes of the open <a> elements
		};

		const options = {};
		const strict = true;
		const saxStream = require('sax').createStream(strict, options);

		// Without this, a failure of the source stream would be an unhandled
		// "error" event and the promise would never settle.
		stream.on('error', function(e) {
			reject(e);
		});

		saxStream.on('error', function(e) {
			reject(e);
		});

		saxStream.on('text', function(text) {
			output = collapseWhiteSpaceAndAppend(output, state, text);
		});

		saxStream.on('opentag', function(node) {
			const n = node.name.toLowerCase();
			const heading = /^h([1-6])$/.exec(n);

			if (n == 'en-note') {
				// Start of note
			} else if (isBlockTag(n)) {
				output.push(BLOCK_OPEN);
			} else if (isListTag(n)) {
				output.push(BLOCK_OPEN);
				state.lists.push({ tag: n, counter: 1 });
			} else if (n == 'li') {
				output.push(BLOCK_OPEN);
				if (!state.lists.length) {
					reject("Found <li> tag without being inside a list"); // TODO: could be a warning, but nothing to handle warnings at the moment
					return;
				}

				const container = state.lists[state.lists.length - 1];
				if (container.tag == "ul") {
					output.push("- ");
				} else {
					output.push(container.counter + '. ');
					container.counter++;
				}
			} else if (isStrongTag(n)) {
				output.push("**");
			} else if (isAnchor(n)) {
				state.anchorAttributes.push(node.attributes);
				output.push('[');
			} else if (isEmTag(n)) {
				output.push("*");
			} else if (n == "en-todo") {
				// The "checked" attribute may be absent, in which case the todo is unchecked.
				const checked = node.attributes && node.attributes.checked;
				const x = checked && checked.toLowerCase() == 'true' ? 'X' : ' ';
				output.push('- [' + x + '] ');
			} else if (heading) {
				// <h1> to <h6>: one "#" per heading level
				output.push(BLOCK_OPEN);
				output.push('#'.repeat(Number(heading[1])) + ' ');
			} else if (isCodeTag(n)) {
				output.push(BLOCK_OPEN);
				state.inCode = true;
			} else if (n == "br") {
				output.push(NEWLINE);
			} else if (n == "en-media") {
				console.warn('TODO: en-media');
				// Original implementation, still to be ported:
				//
				// attrs = attributesLIFO.back();
				// QString hash = attrs["hash"];
				// Resource resource;
				// for (int i = 0; i < state.resources.size(); i++) {
				// 	Resource r = state.resources[i];
				// 	if (r.id == hash) {
				// 		resource = r;
				// 		state.resources.erase(state.resources.begin() + i);
				// 		break;
				// 	}
				// }
				// // If the resource does not appear among the note's resources, it
				// // means it's an attachment. It will be appended along with the
				// // other remaining resources at the bottom of the markdown text.
				// if (resource.id != "") {
				// 	addResourceTag(lines, resource, attrs["alt"]);
				// }
			} else if (n == "span" || n == "font") {
				// Ignore
			} else {
				reject("Unsupported start tag:" + n); // TODO: should be a warning
			}
		});

		saxStream.on('closetag', function(name) {
			// Normalise the name the same way as the "opentag" handler so that
			// both handlers agree on which element they are dealing with.
			const n = name.toLowerCase();

			if (n == 'en-note') {
				// End of note
			} else if (isNewLineOnlyEndTag(n)) {
				output.push(BLOCK_CLOSE);
			} else if (isIgnoredEndTag(n)) {
				// Skip
			} else if (isListTag(n)) {
				output.push(BLOCK_CLOSE);
				state.lists.pop();
			} else if (isStrongTag(n)) {
				output.push("**");
			} else if (isEmTag(n)) {
				output.push("*");
			} else if (isCodeTag(n)) {
				state.inCode = false;
				output.push(BLOCK_CLOSE);
			} else if (isAnchor(n)) {
				const attributes = state.anchorAttributes.pop();
				const url = attributes && attributes.href ? attributes.href : '';
				output.push('](' + url + ')');
			} else if (n == "en-media") {
				// Skip
			} else {
				reject("Unsupported end tag:" + n); // TODO: should be a warning
			}
		});

		saxStream.on('end', function() {
			resolve(output);
		});

		stream.pipe(saxStream);
	});
}
const path = require('path');
/**
 * Recursively goes through `dir`, one entry at a time, and converts the sample
 * files it finds: the file is parsed with enexXmlToMd() and both the token
 * array and the rendered Markdown are printed.
 *
 * NOTE: this is currently a debugging helper - only files named "sample4.xml"
 * are processed, every other file is skipped.
 *
 * @param {string} dir - Directory to scan.
 * @param {function(?Error)} done - Called once with `null` when the whole tree
 *   has been processed, or with the first error encountered (directory that
 *   cannot be read, failed conversion), including errors from sub-directories.
 */
function walk(dir, done) {
	fs.readdir(dir, function (error, list) {
		if (error) return done(error);

		let i = 0;

		(function next () {
			const name = list[i++];
			if (!name) return done(null);

			const file = path.join(dir, name);

			fs.stat(file, function (error, stat) {
				if (stat && stat.isDirectory()) {
					walk(file, function (error) {
						// Stop on a nested failure too, exactly like for a
						// failure at the current level.
						if (error) return done(error);
						next();
					});
					return;
				}

				// Debugging filter: only this sample is processed for now
				if (path.basename(file) != 'sample4.xml') {
					next();
					return;
				}

				if (path.extname(file) != '.xml') {
					next();
					return;
				}

				console.info('Processing: ' + file);

				const stream = fs.createReadStream(file);
				enexXmlToMd(stream).then((md) => {
					console.info(md);
					console.info(processMdArrayNewLines(md));
					next();
				}).catch((error) => {
					console.error(error);
					return done(error);
				});
			});
		})();
	});
}
// Process the local sample directory and report when it is done. Any error
// reported by walk() is re-thrown.
walk('/home/laurent/Dropbox/Samples/', (error) => {
	if (error) throw error;

	const separator = '-------------------------------------------------------------';
	console.log(separator);
	console.log('finished.');
	console.log(separator);
});
function parseXml(xml) {
return new Promise((resolve, reject) => {
xml2js.parseString(xml, (err, result) => {
@ -32,12 +414,48 @@ function readFile(path, options = null) {
});
}
/**
 * Tells whether `n` is the (lower case) name of an element that starts a new
 * block when it opens.
 *
 * @param {string} n - Tag name.
 * @returns {boolean}
 */
function isBlockTag(n) {
	const blockTags = ["div", "p", "dl", "dd", "center", "table", "tr", "td", "th", "tbody"];
	return blockTags.includes(n);
}
/**
 * Tells whether `n` is the name of an element rendered as bold text.
 *
 * @param {string} n - Tag name.
 * @returns {boolean}
 */
function isStrongTag(n) {
	switch (n) {
		case "strong":
		case "b":
			return true;
		default:
			return false;
	}
}
/**
 * Tells whether `n` is the name of an element rendered as emphasised text.
 * Underline (<u>) is treated as emphasis too.
 *
 * @param {string} n - Tag name.
 * @returns {boolean}
 */
function isEmTag(n) {
	const emphasisTags = ["em", "i", "u"];
	return emphasisTags.includes(n);
}
/**
 * Tells whether `n` is the name of a link element.
 *
 * @param {string} n - Tag name.
 * @returns {boolean}
 */
function isAnchor(n) {
	const anchorTag = "a";
	return n == anchorTag;
}
/**
 * Tells whether the end tag named `n` produces no output at all.
 *
 * @param {string} n - Tag name.
 * @returns {boolean}
 */
function isIgnoredEndTag(n) {
	const ignoredEndTags = ["en-note", "en-todo", "span", "body", "html", "font", "br"];
	return ignoredEndTags.includes(n);
}
/**
 * Tells whether `n` is the name of a list container (ordered or unordered).
 *
 * @param {string} n - Tag name.
 * @returns {boolean}
 */
function isListTag(n) {
	switch (n) {
		case "ol":
		case "ul":
			return true;
		default:
			return false;
	}
}
/**
 * Tells whether `n` is the name of an element whose end tag doesn't require
 * any special treatment besides closing the current block (i.e. adding a
 * newline).
 *
 * All six heading levels are listed: a block is opened for each of them,
 * <h6> included, so each of them must close it again.
 *
 * @param {string} n - Tag name.
 * @returns {boolean}
 */
function isNewLineOnlyEndTag(n) {
	const newLineOnlyEndTags = [
		"div", "p", "li",
		"h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "dd", "center",
		"table", "tr", "td", "th", "tbody",
	];
	return newLineOnlyEndTags.includes(n);
}
/**
 * Tells whether `n` is the name of an element whose content is code.
 *
 * @param {string} n - Tag name.
 * @returns {boolean}
 */
function isCodeTag(n) {
	const codeTags = ["pre", "code"];
	return codeTags.includes(n);
}
/**
 * Tells whether the token `s` is one of the two block markers
 * (BLOCK_OPEN or BLOCK_CLOSE).
 *
 * @param {string} s - Token.
 * @returns {boolean}
 */
function isNewLineBlock(s) {
	const blockMarkers = [BLOCK_OPEN, BLOCK_CLOSE];
	return blockMarkers.includes(s);
}
/**
 * Returns the first item of `xmlNode`, or an empty string when `xmlNode` is
 * missing or empty.
 *
 * @param {Array|undefined|null} xmlNode - Parsed XML node.
 * @returns {*} The first item, or ''.
 */
function xmlNodeText(xmlNode) {
	const hasContent = Boolean(xmlNode) && Boolean(xmlNode.length);
	return hasContent ? xmlNode[0] : '';
}
function dateToTimestamp(s) {
let m = moment(s, 'YYYYMMDDTHHmmssZ');
if (!m.isValid()) {
@ -46,39 +464,15 @@ function dateToTimestamp(s) {
return Math.round(m.toDate().getTime() / 1000);
}
function xmlToMd(xml) {
/**
 * Parses `xml` with parseXml() and prints the parsed document.
 *
 * @param {string} xml - XML text.
 * @returns {Promise<undefined>} Settles once the document has been parsed and printed.
 */
function evernoteXmlToMdArray(xml) {
	const printParsed = (parsed) => {
		console.info(parsed);
	};
	return parseXml(xml).then(printParsed);
}
// Minimal note body (some text, empty blocks and an embedded JPEG referenced
// by an <en-media> tag) used as ad-hoc input for xmlToMd() below.
let contentTest = `
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">
<en-note style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">
Hello, World.
<div>
<br/>
</div>
<div>
<en-media alt="" type="image/jpeg" hash="dd7b6d285d09ec054e8cd6a3814ce093"/>
</div>
<div>
<br/>
</div>
</en-note>
`;
// Convert the sample and print whatever the conversion resolves with.
xmlToMd(contentTest).then((md) => {
console.info(md);
});
function toApiNote(xml) {
let o = {};
//console.info(xml);
o.id = uuid.create();
o.title = xmlNodeText(xml.title);
@ -101,31 +495,11 @@ function toApiNote(xml) {
o.tags = [];
if (xml.tag && xml.tag.length) o.tags = xml.tag;
//console.info(o);
return o;
}
// `id` binary(16) NOT NULL,
// `completed` tinyint(1) NOT NULL default '0',
// `created_time` int(11) NOT NULL default '0',
// `updated_time` int(11) NOT NULL default '0',
// `latitude` DECIMAL(10, 8) NOT NULL default '0',
// `longitude` DECIMAL(11, 8) NOT NULL default '0',
// `altitude` DECIMAL(9, 4) NOT NULL default '0',
// `parent_id` binary(16) NULL default NULL,
// `owner_id` binary(16),
// `is_encrypted` tinyint(1) NOT NULL default '0',
// `encryption_method` int(11) NOT NULL default '0',
// `order` int(11) NOT NULL default '0',
// `is_todo` tinyint(1) NOT NULL default '0',
// `todo_due` int(11) NOT NULL default '0',
// `todo_completed` int(11) NOT NULL default '0',
// `application_data` varchar(1024) NOT NULL DEFAULT "",
// `author` varchar(512) NOT NULL DEFAULT "",
// `source` varchar(512) NOT NULL DEFAULT "",
// `source_application` varchar(512) NOT NULL DEFAULT "",
// `source_url` varchar(1024) NOT NULL DEFAULT "",
// readFile('sample.enex', 'utf8').then((content) => {
// return parseXml(content);
@ -155,97 +529,4 @@ function toApiNote(xml) {
// password: '12345678',
// }).then((session) => {
// console.info(session);
// });
// <?xml version="1.0" encoding="UTF-8"?>
// <!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export3.dtd">
// <en-export export-date="20130730T205637Z" application="Evernote" version="Evernote Mac">
// <note>
// <title>Test Note for Export</title>
// <content>
// <![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
// <!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">
// <en-note style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">
// Hello, World.
// <div>
// <br/>
// </div>
// <div>
// <en-media alt="" type="image/jpeg" hash="dd7b6d285d09ec054e8cd6a3814ce093"/>
// </div>
// <div>
// <br/>
// </div>
// </en-note>
// ]]>
// </content>
// <created>20130730T205204Z</created>
// <updated>20130730T205624Z</updated>
// <tag>fake-tag</tag>
// <note-attributes>
// <latitude>33.88394692352314</latitude>
// <longitude>-117.9191355110099</longitude>
// <altitude>96</altitude>
// <author>Brett Kelly</author>
// </note-attributes>
// <resource>
// <data encoding="base64">/9j/4AAQSkZJRgABAQAAAQABAAD/4gxYSUNDX1BST0ZJTEUAAQEAAAxITGlubwIQAABtbnRyUkdCIFhZ
// WiAHzgACAAkABgAxAABhY3NwTVNGVAAAAABJRUMgc1JHQgAAAAAAAAAAAAAAAAAA9tYAAQAAAADTLUhQ
// <!-- ... -->
// kfeIGT/+uufk8DpM0gyVjGfmzkgetesnUoTHJ+5Cxn86zmv4/wB75EW+QHAPUH/P9Ky+s1rtrr/wfvOm
// dBSamnq/xPKp/hpLKmS7x4OBjgn6elee6v4OuLJirRSHb/FtyG9s9u1fR0+oTiIRvGq7W4bpisfUGk1C
// GVWtkIyM57n1rfDY+uqigtU76ffZkUsA6iajHZ6v/P8A4B//2Q==</data>
// <mime>image/jpeg</mime>
// <width>1280</width>
// <height>720</height>
// <resource-attributes>
// <file-name>snapshot-DAE9FC15-88E3-46CF-B744-DA9B1B56EB57.jpg</file-name>
// </resource-attributes>
// </resource>
// </note>
// <note>
// <title>Test Note for Export</title>
// <content>
// <![CDATA[<?xml version="1.0" encoding="UTF-8" standalone="no"?>
// <!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd">
// <en-note style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">
// Hello, World.
// <div>
// <br/>
// </div>
// <div>
// <en-media alt="" type="image/jpeg" hash="dd7b6d285d09ec054e8cd6a3814ce093"/>
// </div>
// <div>
// <br/>
// </div>
// </en-note>
// ]]>
// </content>
// <created>20130730T205204Z</created>
// <updated>20130730T205624Z</updated>
// <tag>fake-tag</tag>
// <note-attributes>
// <latitude>33.88394692352314</latitude>
// <longitude>-117.9191355110099</longitude>
// <altitude>96</altitude>
// <author>Brett Kelly</author>
// </note-attributes>
// <resource>
// <data encoding="base64">/9j/4AAQSkZJRgABAQAAAQABAAD/4gxYSUNDX1BST0ZJTEUAAQEAAAxITGlubwIQAABtbnRyUkdCIFhZ
// WiAHzgACAAkABgAxAABhY3NwTVNGVAAAAABJRUMgc1JHQgAAAAAAAAAAAAAAAAAA9tYAAQAAAADTLUhQ
// <!-- ... -->
// kfeIGT/+uufk8DpM0gyVjGfmzkgetesnUoTHJ+5Cxn86zmv4/wB75EW+QHAPUH/P9Ky+s1rtrr/wfvOm
// dBSamnq/xPKp/hpLKmS7x4OBjgn6elee6v4OuLJirRSHb/FtyG9s9u1fR0+oTiIRvGq7W4bpisfUGk1C
// GVWtkIyM57n1rfDY+uqigtU76ffZkUsA6iajHZ6v/P8A4B//2Q==</data>
// <mime>image/jpeg</mime>
// <width>1280</width>
// <height>720</height>
// <resource-attributes>
// <file-name>snapshot-DAE9FC15-88E3-46CF-B744-DA9B1B56EB57.jpg</file-name>
// </resource-attributes>
// </resource>
// </note>
// </en-export>
// });

View File

@ -1,26 +1,28 @@
{
"name": "CliClient",
"version": "0.0.1",
"private": true,
"dependencies": {
"app-module-path": "^2.2.0",
"form-data": "^2.1.4",
"moment": "^2.18.1",
"node-fetch": "^1.7.1",
"promise": "^7.1.1",
"react": "16.0.0-alpha.6",
"uuid": "^3.0.1",
"xml2js": "^0.4.17"
},
"devDependencies": {
"babel-changed": "^7.0.0",
"babel-cli": "^6.24.1",
"babel-preset-env": "^1.5.1",
"babel-preset-react": "^6.24.1",
"query-string": "4.3.4"
},
"scripts": {
"build": "babel-changed app -d build",
"clean": "babel-changed --reset"
}
"name": "CliClient",
"version": "0.0.1",
"private": true,
"dependencies": {
"app-module-path": "^2.2.0",
"form-data": "^2.1.4",
"moment": "^2.18.1",
"node-fetch": "^1.7.1",
"promise": "^7.1.1",
"react": "16.0.0-alpha.6",
"sax": "^1.2.2",
"string-to-stream": "^1.1.0",
"uuid": "^3.0.1",
"xml2js": "^0.4.17"
},
"devDependencies": {
"babel-changed": "^7.0.0",
"babel-cli": "^6.24.1",
"babel-preset-env": "^1.5.1",
"babel-preset-react": "^6.24.1",
"query-string": "4.3.4"
},
"scripts": {
"build": "babel-changed app -d build",
"clean": "babel-changed --reset"
}
}

View File

@ -1,7 +1,7 @@
#!/bin/bash
# Absolute path of the directory that contains this script.
CLIENT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
# Remove any existing app/src entry before the link is re-created below.
rm "$CLIENT_DIR/app/src"
rm -f "$CLIENT_DIR/app/src"
# Link the src directory of the React Native client into app/.
ln -s "$CLIENT_DIR/../ReactNativeClient/src" "$CLIENT_DIR/app"
# Build, then run the ENEX importer with the build directory on NODE_PATH.
npm run build && NODE_PATH="$CLIENT_DIR/build/" node build/import-enex.js

View File

@ -23,7 +23,8 @@
"file_exclude_patterns": [
"*.pro.user",
"*.pro.user.*",
"*.iml"
"*.iml",
"CliClient/app/src",
]
}
],