Source: index.js

#!/usr/bin/env node

var path = require("path");
var fs = require("fs");
const getopts = require("getopts");
const superagent = require('superagent');
const unzipper = require("unzipper");
const tmp = require("tmp");
tmp.setGracefulCleanup();

const DateParser = require("citeproc").DateParser;
const zoteroToCsl = require('zotero-to-csl');
const zoteroToJurism = require('zotero2jurismcsl').convert;

/** @function getStyle
 * @description Instantiates the citation processor.
 * @returns {Object}
 */
const getStyle = require('./style').getStyle;

/** @namespace callbacks
 * @description Container carrying functions for performing
 *   a downstream sync into the target system.
 * @prop {} init - async function to set the value of initial parameters
 * @prop {} openTransaction - async function to open a database transaction
 * @prop {} closeTransaction - async function to close a database transaction
 * @prop {} items - async functions to add, delete, or modify item metadata
 * @prop {} attachments - async functions to add, delete, or modify attachment metadata
 * @prop {} files - async functions to add, purge, or check for the existence of files
 */
const callbacks = require('./callbacks').callbacks;

/** @function newUpdateSpec
 * @description Build an empty update specification for one downstream sync pass.
 *   Every call produces a brand-new object with brand-new arrays, so the
 *   result can be mutated freely by the caller.
 * @prop {Object} items - add/mod/del key lists for parent items
 * @prop {Object} attachments - add/mod/del key lists for attachments
 * @prop {Array} files - file list (starts empty)
 * @return {Object}
 */
const newUpdateSpec = () => {
    // One fresh add/mod/del triple per category; nothing is shared between calls.
    const emptyDelta = () => ({ add: [], mod: [], del: [] });
    return {
        items: emptyDelta(),
        attachments: emptyDelta(),
        files: []
    };
};

/** @type {string[]}
 * @description Names of the CSL date variables. buildSiteItem checks each of
 *   these fields on a converted item and, when the value is still a plain
 *   string, replaces it with the result of DateParser.parseDateToArray().
 */
const CSL_DATE_VARIABLES = [
    "accessed",
    "available-date",
    "event-date",
    "issued",
    "original-date",
    "submitted"
];

/**
 * Read config.json from the data directory (creating a template first when
 * the -i option is set), validate it, and make sure the key cache file exists.
 *
 * @param {Object} opts - parsed command-line options; `opts.d` selects the
 *   data directory (default: current working directory), `opts.i` requests
 *   initialization of an empty directory
 * @param {string} keyCacheJson - file name of the persistent key cache
 * @returns {Object} the parsed configuration, extended with `styleName` and
 *   an empty `fileExtFromKey` map
 */
function getConfig(opts, keyCacheJson) {
    const dataPath = opts.d ? opts.d : process.cwd();
    console.log(`Running with node version ${process.version}`);
    console.log(`zsyncdown: using data directory ${dataPath}`);

    // Initialization is refused unless the target directory is empty.
    if (opts.i) {
        const existingEntries = fs.readdirSync(dataPath);
        if (existingEntries.length > 0) {
            abort("the -i option can only be used in an empty data directory");
        }
    }

    const configFilePath = path.join(dataPath, "config.json");
    console.log(`zsyncdown: opening config file at ${configFilePath}`);
    if (!fs.existsSync(configFilePath)) {
        if (!opts.i) {
            abort("config.json does not exist. Use the --init option.");
        } else {
            const header = "// Configuration file for citeproc-cite-service\n// For details, see README.md or https://www.npmjs.com/package/citeproc-cite-service\n";
            const template = {
                dataPath: dataPath,
                dataMode: "CSL-M",
                cslMode: "CSL",
                access: {
                    groupID: false,
                    libraryKey: false
                }
            };
            fs.writeFileSync(configFilePath, header + JSON.stringify(template, null, 2));
        }
    }

    // The file is JSON plus "//" comment lines; comment (and empty) lines are
    // dropped before parsing.
    const rawLines = fs.readFileSync(configFilePath).toString().split("\n");
    const jsonText = rawLines
        .filter((line) => line.slice(0, 2) !== "//" && line)
        .join("\n");
    const cfg = JSON.parse(jsonText);

    const validModes = ["CSL", "CSL-M"];
    if (!validModes.includes(cfg.dataMode)) {
        abort("dataMode must be set to \"CSL\" or \"CSL-M\" in " + configFilePath);
    }

    const usesZoteroStyle = cfg.dataMode === "CSL";
    console.log(usesZoteroStyle ? "Using Zotero style" : "Using Jurism style");
    cfg.styleName = usesZoteroStyle
        ? "chicago-fullnote-bibliography.csl"
        : "jm-chicago-fullnote-bibliography.csl";

    cfg.fileExtFromKey = {};

    // Note: the directory recorded inside the config file is the one checked here.
    if (!fs.existsSync(cfg.dataPath)) {
        abort("data directory does not exist");
    }
    const keyCacheFile = path.join(cfg.dataPath, keyCacheJson);
    if (!fs.existsSync(keyCacheFile)) {
        const emptyCache = {library:0,items:{},attachments:{}};
        fs.writeFileSync(keyCacheFile, JSON.stringify(emptyCache, null, 2));
    }
    console.log(`zsyncdown: Using persistent cache file at ${keyCacheFile}`);

    return cfg;
}

/**
 * Report a fatal error and terminate the process.
 *
 * The process exits with status 1 so that shells, cron jobs and other
 * wrappers can tell a failed sync from a successful one.
 *
 * @param {*} e - the error (or any thrown value) to report
 * @param {*} [context] - optional extra detail, e.g. the key of the item
 *   being processed when the error occurred
 */
function handleError(e, context) {
    if (context !== undefined) {
        console.error("zsyncdown ERROR context: " + context);
    }
    console.error(e);
    process.exit(1);
}

/**
 * Print a fatal usage/configuration message and terminate the process.
 *
 * Exits with status 1 so callers of the script can detect the failure.
 *
 * @param {string} msg - human-readable reason for aborting
 */
function abort(msg) {
    console.error("zsyncdown ERROR: " + msg);
    process.exit(1);
}

/**
 * Sync runner. Loads the configuration, verifies that API credentials are
 * present, reads the persistent key/version cache and instantiates the
 * citation style.
 *
 * @constructor
 * @param {Object} opts - parsed command-line options
 * @param {Object} callbacks - downstream sync callbacks
 */
var Runner = function(opts, callbacks) {
    const keyCacheName = "keyCache.json";

    this.emptyPdfInfo = [];
    this.keyCacheJson = keyCacheName;

    const cfg = getConfig(opts, keyCacheName);
    cfg.opts = opts;
    this.cfg = cfg;

    // All three of access, access.groupID and access.libraryKey must be truthy.
    const access = cfg.access;
    const hasCredentials = Boolean(access && access.groupID && access.libraryKey);
    if (!hasCredentials) {
        abort("no access credentials found in config. See the README.");
    }

    this.callbacks = callbacks;

    const cachePath = path.join(cfg.dataPath, keyCacheName);
    this.oldVersions = JSON.parse(fs.readFileSync(cachePath));
    this.style = getStyle(cfg);
};

/**
 * Perform an authenticated GET against the Zotero group API, honouring a
 * single Retry-After request from the server.
 *
 * @param {string} uriStub - path and query appended to the group base URL
 * @param {boolean} [isAttachment] - when true, the response body is buffered
 *   and parsed with superagent's image (binary) parser instead of as JSON
 * @param {boolean} [final] - internal: true on the retry attempt, so that a
 *   second Retry-After response fails instead of looping
 * @returns {Promise<Object>} the superagent response
 * @throws {Error} when the server still asks for a retry on the second attempt;
 *   any other request error is rethrown unchanged
 */
Runner.prototype.callAPI = async function(uriStub, isAttachment, final) {
    var itemsUrl = "https://api.zotero.org/groups/" + this.cfg.access.groupID + uriStub;
    console.log("+ CALL " + itemsUrl);
    var request = superagent.get(itemsUrl);
    if (isAttachment) {
        request = request.buffer(true).parse(superagent.parse.image);
    } else {
        request = request.set("content-type", "application/json");
    }
    var ret = null;
    try {
        ret = await request.set("Authorization", "Bearer " + this.cfg.access.libraryKey);
    } catch (e) {
        // superagent rejects on 4xx/5xx, which is how a throttled (429/503)
        // response arrives. Only such responses carrying Retry-After are
        // handled here; everything else is the caller's business.
        var throttled = e && e.response && e.response.header && e.response.header["retry-after"];
        if (!throttled) {
            throw e;
        }
        ret = e.response;
    }
    var retryAfter = ret.header["retry-after"];
    if (retryAfter) {
        if (final) {
            throw new Error("API call failed after retry: " + uriStub);
        }
        // Retry-After is expressed in seconds; setTimeout wants milliseconds.
        var seconds = parseInt(retryAfter, 10);
        var delayMs = Number.isFinite(seconds) && seconds > 0 ? seconds * 1000 : 0;
        await new Promise(function(resolve) {
            setTimeout(resolve, delayMs);
        });
        ret = await this.callAPI(uriStub, isAttachment, true);
    }
    return ret;
}

/**
 * Determine the real file type of a downloaded attachment. If the download
 * is a zip archive, it is unpacked and the first regular file found in it is
 * used instead of the archive.
 *
 * @param {Buffer} buf - raw attachment content as downloaded
 * @returns {Promise<{fileInfo: Object, buf: Buffer}>} `fileInfo` carries at
 *   least `ext` and `mime`; `buf` is the content to store (the unpacked file
 *   when the download was an archive containing one)
 */
Runner.prototype.getRealBufferAndExt = async function(buf) {
    // unsafeCleanup lets the directory be removed although it is not empty.
    var tmpInfo = tmp.dirSync({ unsafeCleanup: true });
    try {
        var filePath = path.join(tmpInfo.name, "myfile");
        fs.writeFileSync(filePath, buf);
        var fileTyper = await import("file-type");
        // fileTypeFromFile() yields undefined when the type cannot be detected.
        var fileInfo = await fileTyper.fileTypeFromFile(filePath);
        if (fileInfo && fileInfo.mime === "application/zip") {
            await this.unzip(filePath, tmpInfo.name);
            for (var fn of fs.readdirSync(tmpInfo.name)) {
                if (fn === "myfile") continue;
                var newFilePath = path.join(tmpInfo.name, fn);
                // Only regular files can be typed and read.
                if (!fs.statSync(newFilePath).isFile()) continue;
                fileInfo = await fileTyper.fileTypeFromFile(newFilePath);
                fileInfo = this.fixFileTypeTxtFail(fileInfo, fn);
                buf = fs.readFileSync(newFilePath);
                break;
            }
        } else {
            // Same fallback as for archive members, so that callers can rely
            // on fileInfo.ext being present.
            fileInfo = this.fixFileTypeTxtFail(fileInfo, "myfile");
        }
        return {
            fileInfo: fileInfo,
            buf: buf
        };
    } finally {
        // Remove the scratch directory now rather than leaving one per file.
        tmpInfo.removeCallback();
    }
}

/**
 * Supply a fallback file type when detection produced nothing.
 *
 * @param {Object|undefined} fileInfo - detection result, possibly empty
 * @param {string} fn - file name, used to recognise ".txt" files
 * @returns {Object} `fileInfo` itself when truthy; otherwise a text/plain
 *   descriptor for names ending in ".txt" and a PDF descriptor for all others
 */
Runner.prototype.fixFileTypeTxtFail = function(fileInfo, fn) {
    if (fileInfo) {
        return fileInfo;
    }
    const looksLikeText = fn.slice(-4) === ".txt";
    return looksLikeText
        ? { ext: "txt", mime: "text/plain" }
        : { ext: "pdf", mime: "application/pdf" };
}

/**
 * Unpack every file of a zip archive into a directory.
 *
 * The returned promise settles only after each extracted file has been
 * completely written, so the files can be read immediately afterwards.
 * Directory entries and entries whose path would land outside `outputPath`
 * are skipped.
 *
 * @param {string} filePath - path of the zip archive
 * @param {string} outputPath - existing directory to extract into
 * @returns {Promise<void>}
 */
Runner.prototype.unzip = async function(filePath, outputPath) {
    const zip = fs.createReadStream(filePath).pipe(unzipper.Parse({forceStream: true}));
    const root = path.resolve(outputPath);
    for await (const entry of zip) {
        const target = path.resolve(root, entry.path);
        const staysInside = target.startsWith(root + path.sep);
        if (entry.type !== "File" || !staysInside) {
            // Every entry must be consumed, or the parser stalls.
            entry.autodrain();
            continue;
        }
        fs.mkdirSync(path.dirname(target), { recursive: true });
        await new Promise((resolve, reject) => {
            entry.on("error", reject);
            entry.pipe(fs.createWriteStream(target))
                .on("finish", resolve)
                .on("error", reject);
        });
    }
}

/**
 * Fetch a "format=versions" listing from the API.
 *
 * @param {string} uriStub - API path and query for the listing
 * @returns {Promise<{library: *, keys: Object}>} `library` is the value of the
 *   last-modified-version response header, `keys` the parsed response body
 */
Runner.prototype.getVersions = async function(uriStub) {
    let libraryVersion;
    let keyVersions;
    try {
        const response = await this.callAPI(uriStub);
        libraryVersion = response.header["last-modified-version"];
        keyVersions = JSON.parse(response.text);
    } catch (err) {
        handleError(err);
    }
    return { library: libraryVersion, keys: keyVersions };
};

/**
 * Collect the current library version plus the key/version maps of all
 * top-level non-note items and of all attachments.
 *
 * @returns {Promise<{library: *, items: Object, attachments: Object}>}
 */
Runner.prototype.callVersions = async function() {
    const result = { library: null, items: null, attachments: null };
    try {
        // The library version is taken from the items listing.
        const topLevel = await this.getVersions("/items/top?itemType=-note&format=versions");
        result.library = topLevel.library;
        result.items = topLevel.keys;

        const attached = await this.getVersions("/items?itemType=attachment&format=versions");
        result.attachments = attached.keys;
    } catch (err) {
        handleError(err);
    }
    return result;
};

/**
 * Record the given library version in the in-memory version cache and write
 * the whole cache to the key cache file in the data directory.
 *
 * @param {*} libraryVersion - library version to store
 */
Runner.prototype.updateVersionCache = function(libraryVersion) {
    this.oldVersions.library = libraryVersion;
    const cacheFile = path.join(this.cfg.dataPath, this.keyCacheJson);
    const serialized = JSON.stringify(this.oldVersions, null, 2);
    fs.writeFileSync(cacheFile, serialized);
};

/**
 * Fetch the data objects for a batch of item keys.
 *
 * @param {string[]} sublist - item keys to request in a single API call
 * @returns {Promise<Object[]>} the `data` member of each returned item
 */
Runner.prototype.getItems = async function(sublist) {
    let items = null;
    try {
        const response = await this.callAPI("/items?itemKey=" + sublist.join(","));
        items = JSON.parse(response.text).map((entry) => entry.data);
    } catch (err) {
        handleError(err);
    }
    return items;
}

/**
 * Sort keys into deleted, modified and added by comparing two key/version maps.
 *
 * Presence of a key is tested with hasOwnProperty rather than by truthiness
 * of its version, so a cached version of 0 (used in this file as a "refresh
 * me" marker) counts as known: such a key is reported as modified only, not
 * as modified and added.
 *
 * @param {{add: string[], mod: string[], del: string[]}} ret - lists to append to
 * @param {Object} oldVersions - key/version map from the previous sync
 * @param {Object} newVersions - key/version map just fetched
 * @returns {{add: string[], mod: string[], del: string[]}} `ret`, for chaining
 */
Runner.prototype.versionDeltas = function(ret, oldVersions, newVersions) {
    var has = function(map, key) {
        return Object.prototype.hasOwnProperty.call(map, key);
    };
    for (var oldKey in oldVersions) {
        if (!has(newVersions, oldKey)) {
            ret.del.push(oldKey);
        } else if (newVersions[oldKey] > oldVersions[oldKey]) {
            ret.mod.push(oldKey);
        }
    }
    for (var newKey in newVersions) {
        if (!has(oldVersions, newKey)) {
            ret.add.push(newKey);
        }
    }
    return ret;
}

/**
 * Work out which items and attachments need to be added, modified or
 * deleted, by comparing the cached versions with freshly fetched ones.
 *
 * @param {{items: Object, attachments: Object}} newVersions - fetched key/version maps
 * @returns {Promise<Object>} an update spec with the items and attachments lists filled in
 */
Runner.prototype.getUpdateSpec = async function(newVersions) {
    const spec = newUpdateSpec();
    for (const category of ["items", "attachments"]) {
        spec[category] = this.versionDeltas(
            spec[category],
            this.oldVersions[category],
            newVersions[category]
        );
    }
    return spec;
}

/**
 * Find the value of a prefixed tag such as "LN:en".
 *
 * @param {Array<string|{tag: string}>} arr - tags, either plain strings or
 *   objects with a `tag` property; a missing list is treated as empty
 * @param {string} prefix - tag prefix to look for; must end in a colon
 * @param {*} defaultValue - returned when no tag carries the prefix
 * @returns {*} the text after the prefix of the first matching tag in list
 *   order, or `defaultValue`
 */
Runner.prototype.extractTag = function(arr, prefix, defaultValue) {
    try {
        if (prefix.slice(-1) !== ":") {
            throw new Error("Invalid prefix spec: must end in a colon (:)");
        }
        for (var entry of (arr || [])) {
            var tag = (entry && typeof entry === "object") ? entry.tag : entry;
            if (typeof tag === "string" && tag.startsWith(prefix)) {
                return tag.slice(prefix.length);
            }
        }
    } catch (e) {
        handleError(e);
    }
    return defaultValue;
}

/**
 * Derive the country code from a colon-separated jurisdiction identifier.
 *
 * @param {string} [jurisdiction] - e.g. "us:ny"
 * @returns {string} the upper-cased first segment, or "" when no
 *   jurisdiction is given
 */
Runner.prototype.extractCountry = function(jurisdiction) {
    if (!jurisdiction) {
        return "";
    }
    const [country] = jurisdiction.split(":");
    return country.toUpperCase();
}

/**
 * Convert a Zotero item into the record handed to the downstream item
 * callbacks, and note the item's version in the version cache.
 *
 * Side effects: `key`, `version`, `dateAdded` and `dateModified` are deleted
 * from `item`; the item is loaded into the citation processor.
 *
 * @param {Object} item - item data object as returned by the API
 * @returns {{key: string, citation: *, country: string, tags: Array,
 *   relatedItems: string[], cslItem: Object, cslJsonItem: Object}}
 */
Runner.prototype.buildSiteItem = function(item) {
    try {
        var itemKey = item.key;
        var itemVersion = item.version;
        this.oldVersions.items[itemKey] = itemVersion;
        delete item.key;
        delete item.version;
        delete item.dateAdded;
        delete item.dateModified;
        var cslItemZotero = zoteroToCsl(item);
        // zoteroToCsl straight off npm doesn't convert string dates to the
        // CSL array form, so those entries are converted here.
        for (var fieldName of CSL_DATE_VARIABLES) {
            if ("string" === typeof cslItemZotero[fieldName]) {
                cslItemZotero[fieldName] = DateParser.parseDateToArray(cslItemZotero[fieldName]);
            }
        }

        // zoteroToJurism() modifies the object in place as well as
        // returning it, so it is given a deep copy; otherwise
        // cslItemZotero would be unencoded as a side effect.
        var cslItemJurism = zoteroToJurism({data:item}, JSON.parse(JSON.stringify(cslItemZotero)));
        var cslItem = cslItemJurism;
        var cslJsonItem = cslItemZotero;

        // dc:relation holds one URI or a list of URIs; keep the last path
        // segment of each. A local variable is used (no implicit global), and
        // an item without a relations object simply has no related items.
        var relatedItems = [];
        var relations = item.relations && item.relations["dc:relation"];
        if (relations) {
            if (typeof relations === "string") {
                relations = [relations];
            }
            relatedItems = relations.map(s => s.replace(/^.*\//, ""));
        }
        cslItemZotero.id = itemKey;
        cslItemJurism.id = itemKey;
        // The processor gets its own copy of the item, keyed by item key.
        this.style.sys.items = { [itemKey]: JSON.parse(JSON.stringify(cslItemJurism)) };
        this.style.updateItems([itemKey]);
        var country = this.extractCountry(cslItemJurism.jurisdiction);
    } catch (e) {
        handleError(e);
    }
    var citation = this.style.makeCitationCluster([{"id":itemKey}]);
    console.log(`citation: ${citation}`);
    if (relatedItems.length === 0) {
        delete cslJsonItem.id;
    }
    delete cslJsonItem["abstract"];

    // A note starting with "mlzsync1:" + 4-digit length carries a JSON block
    // of that length. Drop callNumber from its extrafields and re-encode the
    // block with an updated length. Items without a note are left alone.
    var note = cslJsonItem.note;
    if (typeof note === "string" && /^mlzsync1:[0-9]{4}/.test(note)) {
        var extraDataStrLen = parseInt(note.slice(9, 13), 10);
        var extraData = JSON.parse(note.slice(13, 13 + extraDataStrLen));
        var remainder = note.slice(13 + extraDataStrLen);
        if (extraData.extrafields) {
            delete extraData.extrafields.callNumber;
        }
        var extraDataStr = JSON.stringify(extraData);
        var paddedLen = String(extraDataStr.length).padStart(4, "0");
        cslJsonItem.note = `mlzsync1:${paddedLen}${extraDataStr}${remainder}`;
    }
    return {
        key: itemKey,
        citation: citation,
        country: country,
        tags: item.tags,
        relatedItems: relatedItems,
        cslItem: cslItem,
        cslJsonItem: cslJsonItem
    }
}

/**
 * Build the record handed to the downstream attachment callbacks and note
 * the attachment's version in the version cache.
 *
 * When the yeet option (-y) is set, attachments whose filename is
 * "empty.pdf" are also remembered in `this.emptyPdfInfo`.
 *
 * @param {Object} attachment - attachment data object as returned by the API
 * @param {*} fulltext - full text of the attachment, passed through as-is
 * @returns {Object} record with key, parentKey, language, filename, fulltext
 *   and note, plus `type` and `ocr` when the corresponding tags are present
 */
Runner.prototype.buildSiteAttachment = function(attachment, fulltext){
    // Tag-encoded metadata: LN:<language>, TY:<type>, OCR:<ocr>.
    const language = this.extractTag(attachment.tags, "LN:", "en");
    const type = this.extractTag(attachment.tags, "TY:", false);
    const ocr = this.extractTag(attachment.tags, "OCR:", false);

    this.oldVersions.attachments[attachment.key] = attachment.version;

    const isPlaceholder = attachment.filename === "empty.pdf";
    if (this.cfg.opts.y && isPlaceholder) {
        this.emptyPdfInfo.push(attachment);
    }

    const siteAttachment = {
        key: attachment.key,
        parentKey: attachment.parentItem,
        language: language,
        filename: attachment.title,
        fulltext: fulltext,
        note: attachment.note
    };
    if (type) {
        siteAttachment.type = type;
    }
    if (ocr) {
        siteAttachment.ocr = ocr;
    }
    return siteAttachment;
}

/**
 * Fetch the indexed full text of an attachment.
 *
 * @param {string} itemKey - key of the attachment
 * @returns {Promise<*>} the `content` member of the API response, or `false`
 *   when the API answers 404 (no full text available)
 */
Runner.prototype.getFulltext = async function(itemKey) {
    var uriStub = "/items/" + itemKey +"/fulltext";
    // Declared locally: an undeclared `res` would be a shared global.
    var res;
    try {
        res = await this.callAPI(uriStub);
        res = JSON.parse(res.text).content;
    } catch (e) {
        if (e && Number(e.status) === 404) {
            res = false;
        } else {
            handleError(e, itemKey);
        }
    }
    return res;
}

/**
 * Propagate deletions downstream, attachments first and then items, and
 * drop the deleted keys from the version cache.
 *
 * @param {Object} updateSpec - update spec whose `attachments.del` and
 *   `items.del` lists name the keys to delete
 * @returns {Promise<void>}
 */
Runner.prototype.doDeletes = async function(updateSpec) {
    const forget = (cache, keys) => {
        for (const key of keys) {
            delete cache[key];
        }
    };
    try {
        const attachmentKeys = updateSpec.attachments.del;
        await this.callbacks.attachments.del.call(this.cfg, attachmentKeys);
        forget(this.oldVersions.attachments, attachmentKeys);

        const itemKeys = updateSpec.items.del;
        await this.callbacks.items.del.call(this.cfg, itemKeys);
        forget(this.oldVersions.items, itemKeys);
    } catch (err) {
        handleError(err);
    }
}

/**
 * Push added and then modified items downstream, fetching their data from
 * the API in batches of 50 keys.
 *
 * The `add` and `mod` lists in `updateSpec.items` are emptied as they are
 * split into batches.
 *
 * @param {Object} updateSpec - update spec with `items.add` and `items.mod` key lists
 * @returns {Promise<void>}
 */
Runner.prototype.doAddUpdateItems = async function(updateSpec) {
    const transactionSize = 50;
    console.log(`doAddUpdateItems`);

    // Move all keys of one list into batches of at most transactionSize keys.
    const drainIntoBatches = (mode) => {
        const batches = [];
        while (updateSpec.items[mode].length) {
            batches.push(updateSpec.items[mode].slice(0, transactionSize));
            updateSpec.items[mode] = updateSpec.items[mode].slice(transactionSize);
        }
        return batches;
    };

    try {
        for (const mode of ["add", "mod"]) {
            for (const batch of drainIntoBatches(mode)) {
                const fetched = await this.getItems(batch);
                for (const rawItem of fetched) {
                    const siteItem = this.buildSiteItem(rawItem);
                    await this.callbacks.items[mode].call(this.cfg, siteItem);
                }
            }
        }
    } catch (err) {
        handleError(err);
    }
}

/**
 * Push added and modified attachments downstream and make sure their files
 * are present.
 *
 * Steps, in order:
 *  1. metadata of added, then modified attachments (batches of 25 keys);
 *  2. the file of every modified attachment is downloaded unconditionally;
 *  3. for every other attachment in `this.newVersions.attachments`, the file
 *     is downloaded when the `files.exists` callback reports `false`;
 *  4. files of deleted attachments are purged.
 *
 * The `add` and `mod` lists in `updateSpec.attachments` are emptied as they
 * are split into batches.
 *
 * @param {Object} updateSpec - update spec with `attachments.add`,
 *   `attachments.mod` and `attachments.del` key lists
 * @returns {Promise<void>}
 */
Runner.prototype.doAddUpdateAttachments = async function(updateSpec) {
    const transactionSize = 25;
    console.log(`doAddUpdateAttachments`);

    // Move all keys of one list into batches of at most transactionSize keys.
    const drainIntoBatches = (mode) => {
        const batches = [];
        while (updateSpec.attachments[mode].length) {
            batches.push(updateSpec.attachments[mode].slice(0, transactionSize));
            updateSpec.attachments[mode] = updateSpec.attachments[mode].slice(transactionSize);
        }
        return batches;
    };

    // Send the metadata of every attachment in one list ("add" or "mod") downstream.
    const syncMetadata = async (mode) => {
        for (const sublist of drainIntoBatches(mode)) {
            const attachments = await this.getItems(sublist);
            for (const attachment of attachments) {
                const fulltext = await this.getFulltext(attachment.key);
                if (!fulltext) {
                    // Triggers mod on next update if item still exists.
                    // NOTE(review): buildSiteAttachment(), as written in this
                    // file, stores attachment.version under the same key right
                    // after this, which overwrites the 0 — confirm the intent.
                    this.oldVersions.attachments[attachment.key] = 0;
                }
                const siteAttachment = await this.buildSiteAttachment(attachment, fulltext);
                await this.callbacks.attachments[mode].call(this.cfg, siteAttachment);
            }
        }
    };

    // Download one attachment file and hand it to the files.add callback.
    const downloadFile = async (attachmentID) => {
        // true as second argument expects attachment file content
        const response = await this.callAPI("/items/" + attachmentID + "/file", true);
        const info = await this.getRealBufferAndExt(response.body);
        await this.callbacks.files.add.call(this.cfg, attachmentID, info.buf, info.fileInfo.ext);
    };

    try {
        // Remember the modified keys before the list is drained by batching;
        // they are needed again for the file download below.
        const modifiedKeys = updateSpec.attachments.mod.slice();

        await syncMetadata("add");
        await syncMetadata("mod");

        // A key is in the mod list only because its version increased, so
        // its file is refreshed without further checks.
        const attachmentsDone = {};
        for (const modifiedID of modifiedKeys) {
            await downloadFile(modifiedID);
            attachmentsDone[modifiedID] = true;
        }

        // Fill in any file that is missing downstream.
        for (const attachmentID in this.newVersions.attachments) {
            if (attachmentsDone[attachmentID]) continue;
            const attachmentExists = await this.callbacks.files.exists.call(this.cfg, attachmentID);
            if (attachmentExists === false) {
                await downloadFile(attachmentID);
            }
        }
        await this.callbacks.files.purge.call(this.cfg, updateSpec.attachments.del);
    } catch (e) {
        handleError(e);
    }
}

/**
 * Run one complete downstream sync: initialize callbacks, compute what
 * changed, apply deletions, item updates and attachment updates inside the
 * callbacks' transaction, store the new versions, and finally (with -y)
 * remove placeholder PDF files that were recorded during the sync.
 *
 * @returns {Promise<void>}
 */
Runner.prototype.run = async function() {
    try {
        await this.callbacks.init.call(this.cfg);

        const fetchedVersions = await this.callVersions();
        this.newVersions = fetchedVersions;
        const updateSpec = await this.getUpdateSpec(fetchedVersions);

        // The transaction callbacks bracket all downstream writes.
        await this.callbacks.openTransaction.call(this.cfg);
        await this.doDeletes(updateSpec);               // attachments first, then items
        await this.doAddUpdateItems(updateSpec);        // batches of 50
        await this.doAddUpdateAttachments(updateSpec);  // batches of 25
        await this.callbacks.closeTransaction.call(this.cfg);

        // Persist the library and item versions reached by this run.
        this.updateVersionCache(fetchedVersions.library);

        // With -y, delete the stored file of each recorded placeholder PDF.
        if (this.cfg.opts.y) {
            const filesDir = path.join(this.cfg.dirs.topDir, "files");
            this.emptyPdfInfo.forEach((placeholder) => {
                const placeholderPath = path.join(filesDir, `${placeholder.key}.pdf`);
                if (!fs.existsSync(placeholderPath)) {
                    return;
                }
                console.log(`removing empty placeholder PDF file: ${placeholder.title} [${placeholder.key}]`);
                fs.unlinkSync(placeholderPath);
            });
        }
        console.log("Done!");
    } catch (err) {
        handleError(err);
    }
}

// Option declarations handed to getopts: short/long aliases, which options
// take a string value, which are boolean flags, and a handler that aborts
// on any option not declared here.
const optParams = {
    alias: { i: "init", d: "data-dir", y: "yeet-placeholder-pdfs", v: "version", h: "help" },
    string: ["d"],
    boolean: ["h", "i", "y", "v"],
    unknown(option) {
        abort(`unknown option "${option}"`);
    }
};

// Help text printed for -h/--help. It lists every option declared in
// optParams, including -h itself.
const usage = "Usage: " + path.basename(process.argv[1]) + " [options]\n"
      + "  -i, --init\n"
      + "    Initialize the current directory or --data-dir.\n"
      + "  -d, --data-dir <dataDirPath>\n"
      + "    Absolute path to a citeproc-cite-service data directory.\n"
      + "  -y, --yeet-placeholder-pdfs\n"
      + "    Skip placeholder PDF files included in the download\n"
      + "  -v, --version\n"
      + "    Write version number to terminal and exit\n"
      + "  -h, --help\n"
      + "    Write this usage message to terminal and exit\n";

// Parse the command line using the declarations in optParams.
const opts = getopts(process.argv.slice(2), optParams);

// -v: print the version recorded in the package.json next to this script, then stop.
if (opts.v) {
    const pkgPath = path.join(__dirname, "package.json");
    const { version } = JSON.parse(fs.readFileSync(pkgPath).toString());
    console.log(version);
    process.exit();
}

// -h: print the usage text, then stop.
if (opts.h) {
    console.log(usage);
    process.exit();
}

/**
 * Tell whether a path refers to a directory.
 *
 * @param {string} pth - path of an existing file-system entry
 * @returns {boolean} true when `pth` is a directory
 * @throws {Error} whatever fs.statSync throws, e.g. when `pth` does not exist
 */
function isDir(pth) {
    // Inspect the path that was passed in, not a value from the outer scope.
    return fs.statSync(pth).isDirectory();
}
// When -d is given, it must name an existing directory by absolute path.
if (opts.d) {
    const dataDirIsUsable = path.isAbsolute(opts.d) && fs.existsSync(opts.d) && isDir(opts.d);
    if (!dataDirIsUsable) {
        abort("when used, option -d must be set to an existing absolute directory path");
    }
}

// Build the runner from the parsed options and the sync callbacks, then start it.
var runner = new Runner(opts, callbacks);
runner.run();