#!/usr/bin/env node const meName = 'generate-tlds-csv.js'; process.on('unhandledRejection', error => { console.error(meName + ": (FATAL)", error); process.exit(1); }); const request = require('async-request'); const cheerio = require('cheerio'); const countries = require('country-data').countries; const country = require('countryjs'); const stringify = require('csv-stringify'); const parse = require('csv-parse'); const fs = require('fs-extra'); const path = require('path'); const md5File = require('md5-file/promise'); const pathinfo = require('pathinfo'); const program = require('commander'); const tmp = require('tmp'); tmp.setGracefulCleanup(); const fileTldDescCsv = path.dirname(require.main.filename) + '/../../assets/tld-desc.csv'; const fileTldsCsv = path.dirname(require.main.filename) + '/../../tlds.csv'; const urlTldsAlpha = 'http://data.iana.org/TLD/tlds-alpha-by-domain.txt'; const urlDomainsDb = 'https://www.iana.org/domains/root/db'; program .option('-q, --quiet', 'Quiet Mode') .parse(process.argv); if (!program.quiet) { console.log(meName); console.log(" (c) 2017 Doug Bird, All Rights Reserved."); console.log(" see README.md for licensing and other information"); console.log(" https://github.com/katmore/tld-enum#readme"); console.log(""); console.log(" Generates the canonical 'tlds.csv' csv file by downloading resources from iana.org"); console.log(""); } (async() => { const tmpDir = tmp.dirSync({ unsafeCleanup: true }); process.stdout.write("downloading '" + urlTldsAlpha + "'..."); const responseTldsAlpha = await request(urlTldsAlpha); if (responseTldsAlpha.statusCode != 200) { console.log("error"); console.error(meName + ": (FATAL) response status code " + responseTldsAlpha.statusCode + " from URL '" + urlTldsAlpha + "'"); process.exit(1); return; } if (!responseTldsAlpha.body) { console.log("error"); console.error(meName + ": (FATAL) empty response body " + responseTldsAlpha.statusCode + " from URL '" + urlTldsAlpha + "'"); process.exit(1); return; } const fileTldsAlphaTxt = tmpDir.name + '/tlds-alpha-by-domain.txt'; const fileNewTldsCsv = tmpDir.name + '/tlds.csv'; fs.writeFileSync(fileTldsAlphaTxt, responseTldsAlpha.body, 'utf8'); fs.writeFileSync(fileNewTldsCsv, '', 'utf8'); console.log('success'); process.stdout.write("downloading '" + urlDomainsDb + "'..."); const responseDomainsDb = await request(urlDomainsDb); if (responseDomainsDb.statusCode != 200) { console.log("error"); console.error(meName + ": (FATAL) response status code " + responseDomainsDb.statusCode + " from URL '" + urlDomainsDb + "'"); process.exit(1); return; } if (!responseDomainsDb.body) { console.log("error"); console.error(meName + ": (FATAL) empty response body " + responseDomainsDb.statusCode + " from URL '" + urlDomainsDb + "'"); process.exit(1); return; } const htmlDomainsDb = responseDomainsDb.body; var $ = cheerio.load(htmlDomainsDb); console.log('success'); process.stdout.write("building country / TLD hashmap..."); let tld2CountryName = {}; let missingTld = []; countries.all.forEach((c) => { let tld = country.tld(c.alpha3, 'ISO3'); if (!tld) { missingTld.push(c.alpha3); return; } tld2CountryName[tld] = c.name; }); console.log('done'); //console.error('NOTICE: the following "countries" did not have an assigned top level domain: ' + missingTld.join(', ')); process.stdout.write("building description / TLD hashmap..."); let tld2Desc = {}; let parser = parse({ delimiter: ',' }); const csvPosMap = { domain: 0, description: 1, } parser.on('readable', function() { let tldData; while (tldData = parser.read()) { let tld = { domain: null, description: null, }; let prop; for (prop in tld) { if (typeof(tldData[csvPosMap[prop]]) !== 'undefined') { tld[prop] = tldData[csvPosMap[prop]]; } } if (tld.domain && tld.description) { tld2Desc[tld.domain] = tld.description; } } }); parser.write(fs.readFileSync(fileTldDescCsv)); parser.end(); console.log("done"); const tdPosMap = { domain: 0, type: 1, manager: 2, }; let tldSet = []; process.stdout.write("parsing IANA data..."); $('#tld-table').find('tr').each((i, element) => { let tld = { domain: null, type: null, manager: null, }; let tldData = []; // console.log('i ' + i); // console.log(element); $(element).find("td").each((iTd, elementTd) => { // console.log('iTd...'); // console.log(iTd); tldData.push($(elementTd).text()); }); for (var prop in tld) { if (typeof(tldData[tdPosMap[prop]]) !== 'undefined') { tld[prop] = tldData[tdPosMap[prop]]; } } if (!tld.domain) { return; } tld.domain = tld.domain.replace(/\s/g, '').replace(/\./g, ''); tldSet.push(tld); }); console.log('done'); const stringifier = stringify({ delimiter: ',' }); stringifier.on('readable', () => { let row; while (row = stringifier.read()) { fs.appendFileSync(fileNewTldsCsv, row, 'utf8') } }); process.stdout.write("serializing new 'tlds.csv'..."); for (var i = 0; i < tldSet.length; i++) { let tld = tldSet[i]; let csvRow = [tld.domain]; if ((tld.type == 'country-code') && (typeof(tld2CountryName[tld.domain]) !== 'undefined')) { csvRow.push(tld2CountryName[tld.domain]); } else { if (typeof(tld2Desc[tld.domain]) !== 'undefined') { csvRow.push(tld2Desc[tld.domain]); } else { csvRow.push(tld.manager); } } csvRow.push(tld.type); stringifier.write(csvRow); } stringifier.end(); console.log('done'); if (fs.existsSync(fileTldsCsv)) { const newMd5 = await md5File(fileNewTldsCsv); const csvMd5 = await md5File(fileTldsCsv); if (csvMd5 == newMd5) { console.error(meName + ": (NOTICE) ignoring newly generated 'tlds.csv' file that is identical to the existing file (md5: " + csvMd5 + ", path: " + fileTldsCsv + ")"); return; } const pathinfoTldsCsv = pathinfo(fileTldsCsv); const fileBackupTldsCsv = pathinfoTldsCsv.dirname + pathinfoTldsCsv.sep + pathinfoTldsCsv.basename + '-' + csvMd5 + '-backup.csv'; if (!fs.existsSync(fileBackupTldsCsv)) { fs.copySync(fileTldsCsv, fileBackupTldsCsv); } } process.stdout.write("saving new 'tlds.csv'..."); fs.copySync(fileNewTldsCsv, fileTldsCsv); console.log('done'); })();