diff --git a/README.md b/README.md index 7d3c218..1cbee05 100644 --- a/README.md +++ b/README.md @@ -38,5 +38,15 @@ occurency analysis to determine the most probable encoding. * ISO-8859-5 * ISO-8859-6 * ISO-8859-7 +* ISO-8859-8 +* ISO-8859-9 +* windows-1250 +* windows-1251 +* windows-1252 +* windows-1253 +* windows-1254 +* windows-1255 +* windows-1256 +* KOI8-R Currently only these encodings are supported, more will be added soon. \ No newline at end of file diff --git a/encoding/sbcs.js b/encoding/sbcs.js index a673763..f5ab964 100644 --- a/encoding/sbcs.js +++ b/encoding/sbcs.js @@ -595,34 +595,6 @@ module.exports.ISO_8859_7 = function() { }; util.inherits(module.exports.ISO_8859_7, sbcs); - -/* -module.exports.ISO_8859_7 = function() { - this.byteMap = function() { - return [ - - ]; - }; - - this.ngrams = function() { - return [ - - ]; - }; - - this.name = function(det) { - if (typeof det == 'undefined') - return "ISO-8859-7"; - return det.fC1Bytes ? "windows-1253" : "ISO-8859-7"; - }; - - this.language = function() { - return "el"; - }; -}; -util.inherits(module.exports.ISO_8859_7, sbcs); -*/ - module.exports.ISO_8859_8 = function() { this.byteMap = function() { @@ -748,7 +720,7 @@ module.exports.ISO_8859_9 = function() { 0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062, 0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, - 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD, + 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD ]; }; @@ -763,3 +735,220 @@ module.exports.ISO_8859_9 = function() { }; }; util.inherits(module.exports.ISO_8859_9, sbcs); + + +module.exports.windows_1251 = function() { + this.byteMap = function() { + return [ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F, + 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F, + 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20, + 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF, + 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20, + 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, + 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF + ]; + }; + + this.ngrams = function() { + return [ + 0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, + 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE, + 0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, + 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED, + 0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, + 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2, + 0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, + 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520 + ]; + }; + + this.name = function(det) { + return "windows-1251"; + }; + + this.language = function() { + return "ru"; + }; +}; +util.inherits(module.exports.windows_1251, sbcs); + + +module.exports.windows_1256 = function() { + this.byteMap = function() { + return [ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, + 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F, + 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, + 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, + 0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20, + 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF + ]; + }; + + this.ngrams = function() { + return [ + 0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, + 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8, + 0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, + 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD, + 0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, + 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20, + 0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, + 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420 + ]; + }; + + this.name = function(det) { + return "windows-1256"; + }; + + this.language = function() { + return "ar"; + }; +}; +util.inherits(module.exports.windows_1256, sbcs); + + +module.exports.KOI8_R = function() { + this.byteMap = function() { + return [ + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, + 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, + 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, + 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, + 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF + ]; + }; + + this.ngrams = function() { + return [ + 0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, + 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1, + 0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, + 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE, + 0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, + 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1, + 0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, + 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF + ]; + }; + + this.name = function(det) { + return "KOI8-R"; + }; + + this.language = function() { + return "ru"; + }; +}; +util.inherits(module.exports.KOI8_R, sbcs); + + + + +/* +module.exports.ISO_8859_7 = function() { + this.byteMap = function() { + return [ + + ]; + }; + + this.ngrams = function() { + return [ + + ]; + }; + + this.name = function(det) { + if (typeof det == 'undefined') + return "ISO-8859-7"; + return det.fC1Bytes ? "windows-1253" : "ISO-8859-7"; + }; + + this.language = function() { + return "el"; + }; +}; +util.inherits(module.exports.ISO_8859_7, sbcs); +*/ + diff --git a/index.js b/index.js index 45ed415..671dcdd 100644 --- a/index.js +++ b/index.js @@ -29,13 +29,33 @@ var recognisers = [ new sbcs.ISO_8859_6, new sbcs.ISO_8859_7, new sbcs.ISO_8859_8, - new sbcs.ISO_8859_9 - + new sbcs.ISO_8859_9, + new sbcs.windows_1251, + new sbcs.windows_1256, + new sbcs.KOI8_R ]; module.exports.detect = function(buffer) { + // Tally up the byte occurence statistics. + var fByteStats = []; + for (var i = 0; i < 256; i++) + fByteStats[i] = 0; + + for (var i = buffer.length - 1; i >= 0; i--) + fByteStats[buffer[i] & 0x00ff]++; + + var fC1Bytes = false; + for (var i = 0x80; i <= 0x9F; i += 1) { + if (fByteStats[i] != 0) { + fC1Bytes = true; + break; + } + } + var context = { + fByteStats: fByteStats, + fC1Bytes: fC1Bytes, fRawInput: buffer, fRawLength: buffer.length, fInputBytes: buffer, @@ -52,7 +72,6 @@ module.exports.detect = function(buffer) { return a.confidence - b.confidence; }); - // console.log(matches); return matches.length ? matches.pop().name : null; }; diff --git a/test/data/encodings/iso88598 b/test/data/encodings/iso88598 new file mode 100644 index 0000000..3efd22e --- /dev/null +++ b/test/data/encodings/iso88598 @@ -0,0 +1,20 @@ +îä æä éåðé÷åã (Unicode)? + +éåðé÷åã î÷öä îñôø ééçåãé ìëì úå, +ìà îùðä òì àéæå ôìèôåøîä, +ìà îùðä áàéæå úåëðéú, +åìà îùðä áàéæå ùôä. + +áàåôï áñéñé, îçùáéí òåñ÷éí ø÷ áîñôøéí. äí îàçñðéí àåúéåú åúååéí àçøéí òì-éãé ä÷öàú îñôø ìëì àçã îäí. áèøí äåîöà äéåðé÷åã, äéå îàåú îòøëåú ÷éãåã ùåðåú ìä÷öàú äîñôøéí äììå. àó ìà àçú îäï éëìä ìäëéì ëîåú úååéí îñô÷ú. ìãåâîà: ø÷ ìàéçåã äàéøåôàé ðãøùéí ëîä ñåâé ÷éãåãéí ùåðéí òì îðú ìëñåú àú ëì äùôåú äîãåáøåú áå. éúéøä îæàú àó ìùôä áåããú, ëîå àðâìéú ìîùì, ìà äéä ãé áîòøëú ÷éãåã àçú áòáåø ëì äàåúéåú, ñéîðé äôéñå÷ åäñîìéí äèëðééí ùáùéîåù ùåèó. + +îòøëåú ÷éãåã àìå àó ñåúøåú æå àú æå. ëìåîø, ùðé ÷éãåãéí éëåìéí ìäùúîù áàåúå îñôø ìùðé úåéí ðáãìéí, àå ìäùúîù áîñôøéí ùåðéí ìàåúå úå. òì ëì îçùá (åáîéåçã ùøúéí) ìúîåê áîñôø øá ùì îòøëåú ÷éãåã ùåðåú; àåìí ëì àéîú ùðúåðéí òåáøéí áéï îòøëåú ÷éãåã àå ôìèôåøîåú ùåðåú ÷ééí äñéëåï ùééôâîå. +éåðé÷åã îùðä àú ëì æä! + +éåðé÷åã î÷öä îñôø ééçåãé ìëì úå, ììà úìåú áôìèôåøîä, áúåëðéú, àå áùôä. ú÷ï äéåðé÷åã àåîõ òì-éãé äîåáéìéí áúòùééä ëîå Appleþ, HPþ, IBMþ, JustSystemþ, Microsoftþ, Oracleþ, SAPþ, Sunþ, Sybaseþ, Unisysþ åøáéí àçøéí. éåðé÷åã ðãøù òì-éãé ú÷ðéí îåãøðééí ëîå XMLþ, Javaþ, ECMAScript (JavaScript)ýþ, LDAPþ, CORBA 3.0ýþ, WMLþ åëãåîä, åîäååä ìîòùä àú äééùåí äøùîé ùì ú÷ï ISO/IEC 10646. äåà ðúîê òì éãé îòøëåú äôòìä øáåú, ëì äãôãôðéí äçãéùéí, åîåöøéí øáéí àçøéí. äåôòú ú÷ï äéåðé÷åã åæîéðåú äëìéí äúåîëéí áå ðîðåú òí äîâîåú äëìì-òåìîéåú äçùåáåú áéåúø, àùø îñúîðåú ìàçøåðä áèëðåìåâééú äúåëðä. + +ùéìåá éåðé÷åã áééùåîé ùøú-ì÷åç àå áééùåîéí øáé-ùëáåú åáàúøé àéðèøðè îàôùø çéñëåï ðéëø áòìåéåú ìòåîú äùéîåù áñãøåú äúååéí äîñåøúéåú. äåãåú ìéåðé÷åã, îåöø úåëðä àçã àå àúø éçéã áøùú éëåì ìäøçéá àú éòãéå ìîâååï ôìèôåøîåú, àøöåú åùôåú ììà öåøê áùéðåééí îøçé÷éí. éåðé÷åã îàôùø îòáø ðúåðéí ãøê îòøëåú øáåú åùåðåú îáìé ùééôâîå. +ôøèéí àåãåú ä÷åðñåøöéåí ùì éåðé÷åã (Unicode Consortium) + +ä÷åðñåøöéåí ùì éåðé÷åã äåà àøâåï ììà îèøú øååç ùðåñã ëãé ìôúç, ìäøçéá åì÷ãí àú äùéîåù áú÷ï éåðé÷åã, àùø îâãéø àú ééöåâ äè÷ñè áîåöøé úåëðä åú÷ðéí îåãøðééí. çáøéí á÷åðñåøöéåí îâååï øçá ùì úàâéãéí åàøâåðéí áúòùééú äîçùáéí åòéáåã äîéãò. ä÷åðñåøöéåí îîåîï òì-éãé ãîé-çáø áìáã. äçáøåú á÷åðñåøöéåí éåðé÷åã ôúåçä ìàøâåðéí åìàðùéí ôøèééí, áëì øçáé äòåìí, àùø úåîëéí áú÷ï éåðé÷åã åîòåðééðéí ìñééò áäúôúçåúå åäèîòúå. + +ìîéãò ðåñó, øàä îéìåï îåðçéí, øùéîä çì÷éú ùì îåöøéí îåúàîéí ìéåðé÷åã, îáåà èëðé å- çåîøé òæø [÷éùåøéí áàðâìéú]. \ No newline at end of file diff --git a/test/data/encodings/iso88598_he b/test/data/encodings/iso88598_he new file mode 100644 index 0000000..3cbbecf --- /dev/null +++ b/test/data/encodings/iso88598_he @@ -0,0 +1,20 @@ +îä æä éåðé÷åã (Unicode)? + +éåðé÷åã î÷öä îñôø ééçåãé ìëì úå, +ìà îùðä òì àéæå ôìèôåøîä, +ìà îùðä áàéæå úåëðéú, +åìà îùðä áàéæå ùôä. + +áàåôï áñéñé, îçùáéí òåñ÷éí ø÷ áîñôøéí. äí îàçñðéí àåúéåú åúååéí àçøéí òì-éãé ä÷öàú îñôø ìëì àçã îäí. áèøí äåîöà äéåðé÷åã, äéå îàåú îòøëåú ÷éãåã ùåðåú ìä÷öàú äîñôøéí äììå. àó ìà àçú îäï éëìä ìäëéì ëîåú úååéí îñô÷ú. ìãåâîà: ø÷ ìàéçåã äàéøåôàé ðãøùéí ëîä ñåâé ÷éãåãéí ùåðéí òì îðú ìëñåú àú ëì äùôåú äîãåáøåú áå. éúéøä îæàú àó ìùôä áåããú, ëîå àðâìéú ìîùì, ìà äéä ãé áîòøëú ÷éãåã àçú áòáåø ëì äàåúéåú, ñéîðé äôéñå÷ åäñîìéí äèëðééí ùáùéîåù ùåèó. + +îòøëåú ÷éãåã àìå àó ñåúøåú æå àú æå. ëìåîø, ùðé ÷éãåãéí éëåìéí ìäùúîù áàåúå îñôø ìùðé úåéí ðáãìéí, àå ìäùúîù áîñôøéí ùåðéí ìàåúå úå. òì ëì îçùá (åáîéåçã ùøúéí) ìúîåê áîñôø øá ùì îòøëåú ÷éãåã ùåðåú; àåìí ëì àéîú ùðúåðéí òåáøéí áéï îòøëåú ÷éãåã àå ôìèôåøîåú ùåðåú ÷ééí äñéëåï ùééôâîå. +éåðé÷åã îùðä àú ëì æä! + +éåðé÷åã î÷öä îñôø ééçåãé ìëì úå, ììà úìåú áôìèôåøîä, áúåëðéú, àå áùôä. ú÷ï äéåðé÷åã àåîõ òì-éãé äîåáéìéí áúòùééä ëîå Appleþ, HPþ, IBMþ, JustSystemþ, Microsoftþ, Oracleþ, SAPþ, Sunþ, Sybaseþ, Unisysþ åøáéí àçøéí. éåðé÷åã ðãøù òì-éãé ú÷ðéí îåãøðééí ëîå XMLþ, Javaþ, ECMAScript (JavaScript)ýþ, LDAPþ, CORBA 3.0ýþ, WMLþ åëãåîä, åîäååä ìîòùä àú äééùåí äøùîé ùì ú÷ï ISO/IEC 10646. äåà ðúîê òì éãé îòøëåú äôòìä øáåú, ëì äãôãôðéí äçãéùéí, åîåöøéí øáéí àçøéí. äåôòú ú÷ï äéåðé÷åã åæîéðåú äëìéí äúåîëéí áå ðîðåú òí äîâîåú äëìì-òåìîéåú äçùåáåú áéåúø, àùø îñúîðåú ìàçøåðä áèëðåìåâééú äúåëðä. + +ùéìåá éåðé÷åã áééùåîé ùøú-ì÷åç àå áééùåîéí øáé-ùëáåú åáàúøé àéðèøðè îàôùø çéñëåï ðéëø áòìåéåú ìòåîú äùéîåù áñãøåú äúååéí äîñåøúéåú. äåãåú ìéåðé÷åã, îåöø úåëðä àçã àå àúø éçéã áøùú éëåì ìäøçéá àú éòãéå ìîâååï ôìèôåøîåú, àøöåú åùôåú ììà öåøê áùéðåééí îøçé÷éí. éåðé÷åã îàôùø îòáø ðúåðéí ãøê îòøëåú øáåú åùåðåú îáìé ùééôâîå. +ôøèéí àåãåú ä÷åðñåøöéåí ùì éåðé÷åã (Unicode Consortium) + +ä÷åðñåøöéåí ùì éåðé÷åã äåà àøâåï ììà îèøú øååç ùðåñã ëãé ìôúç, ìäøçéá åì÷ãí àú äùéîåù áú÷ï éåðé÷åã, àùø îâãéø àú ééöåâ äè÷ñè áîåöøé úåëðä åú÷ðéí îåãøðééí. çáøéí á÷åðñåøöéåí îâååï øçá ùì úàâéãéí åàøâåðéí áúòùééú äîçùáéí åòéáåã äîéãò. ä÷åðñåøöéåí îîåîï òì-éãé ãîé-çáø áìáã. äçáøåú á÷åðñåøöéåí éåðé÷åã ôúåçä ìàøâåðéí åìàðùéí ôøèééí, áëì øçáé äòåìí, àùø úåîëéí áú÷ï éåðé÷åã åîòåðééðéí ìñééò áäúôúçåúå åäèîòúå. + +ìîéãò ðåñó, øàä îéìåï îåðçéí, øùéîä çì÷éú ùì îåöøéí îåúàîéí ìéåðé÷åã, îáåà èëðé å- çåîøé òæø [÷éùåøéí áàðâìéú]. diff --git a/test/data/encodings/iso88599_tr b/test/data/encodings/iso88599_tr new file mode 100644 index 0000000..f9b3580 --- /dev/null +++ b/test/data/encodings/iso88599_tr @@ -0,0 +1,3 @@ +Leylek (Ciconia ciconia), leylekgiller (Ciconiidae) familyasýndan büyük ve uzun bacaklý bir kuþ türü. Siyah kanat uçuþ tüylerinin dýþýnda tamamen beyazdýr, gagasý ve bacaklarý eriþkinlerde kýrmýzý, yavrularda ise siyahtýr. Cüssesi biraz farklý olan iki alttürü ise Avrupa'da (kuzeyde Finlandiya'ya kadar), kuzeybatý Afrika'da ve güneybatý Asya'da (doðuda Kazakistan'ýn güneyine kadar) bulunur. Leylekler uzun mesafelere göç ederler. Çoðunlukla tropikal Sahraaltý Afrika'dan Güney Afrika'nýn güneyine ve hatta Hindistan altkýtasýnýn güneyine kadar olan bölgede kýþý geçirirler. Avrupa'dan Afrika'ya göç ederken Akdeniz üzerinden deðil, doðuda Levant üzerinden, batýda da Cebelitarýk Boðazý'ndan geçerler. Bunun nedeni uçmak için gereksinim duyduklarý hava termallerinin deniz üzerinde oluþmamasýdýr. Yerde yürürken durmadan, yavaþça hareket ederler. Leylekgiller ailesinin diðer üyeleri gibi boynu tamamen gerilmiþ þekilde uçarlar. +Etçil olan leylek, böcekler, balýk, amfibiler, sürüngenler, küçük memeliler ve küçük kuþlar gibi çok geniþ bir yelpazede beslenir. Besinlerinin çoðunu yerden, kýsa bitki örtüsü içinden ve sýð sulardan toplar. Tekeþli olarak ürerler ancak yaþam boyunca sürecek bir çift baðý kurmazlar. Hem erkeði hem de diþisi, çubuklardan oluþan ve birkaç yýl kullanýlabilen büyük bir yuva yapar. Diþi leylek her yýl bir kereliðine olmak üzere dört yumurta yumurtlar ve yavrular 33-34 gün sonra ayný anda olmamak üzere yumurtadan çýkar. Çifti oluþturan kuþlarýn ikisi de kuluçkaya yatar ve birlikte yavrularý beslerler. Yavrular yumurtadan çýktýktan 58-64 gün sonra yuvadan ayrýlýr ve 7 ila 20 gün daha ebeveynler tarafýndan beslenir. +Leylek, Dünya Doða ve Doðal Kaynaklarý Koruma Birliði (IUCN) tarafýndan asgari endiþe altýndaki türler arasýnda sýnýflandýrýlmýþtýr. Orta Çað boyunca ormanlarýn azalmasý leyleklerin yararýna olmuþtur ancak tarým pratiklerinin deðiþmesi ve sanayileþme 19. yüzyýlda ve 20. yüzyýlýn baþlarýnda Avrupa'nýn bazý bölgelerinde popülasyonlarýnýn azalmasýna ve hatta yok olmasýna neden olmuþtur. Avrupa çapýndaki koruma programlarýnýn sonucunda leyleklerin tekrar Hollanda, Belçika, Ýsviçre ve Ýsveç'te üremeleri saðlanmýþtýr. Doðal düþmanlarýnýn sayýsý azdýr ancak çeþitli parazitler taþýyabilir. Dikkat çekici bir tür olan leylek tarih boyunca bulunduðu bölgelerde çeþitli söylencelere konu olmuþtur. Bunlarýn en bilineni, bebeklerin leylekler tarafýndan getirildiði söylencesidir. \ No newline at end of file diff --git a/test/data/encodings/koi8r b/test/data/encodings/koi8r new file mode 100644 index 0000000..3b31481 --- /dev/null +++ b/test/data/encodings/koi8r @@ -0,0 +1 @@ +ðÅÒ×ÏÍÁÊ × ÓÏ×ÒÅÍÅÎÎÏÍ ×ÉÄÅ ×ÏÚÎÉË × ËÏÎÃÅ XIX ×ÅËÁ × ÒÁÂÏÞÅÍ Ä×ÉÖÅÎÉÉ, ×ÙÄ×ÉÎÕ×ÛÅÍ × ËÁÞÅÓÔ×Å ÏÄÎÏÇÏ ÉÚ ÏÓÎÏ×ÎÙÈ ÔÒÅÂÏ×ÁÎÉÊ ××ÅÄÅÎÉÅ ×ÏÓØÍÉÞÁÓÏ×ÏÇÏ ÒÁÂÏÞÅÇÏ ÄÎÑ. 1 ÍÁÑ 1886 ÇÏÄÁ ÓÏÃÉÁÌÉÓÔÉÞÅÓËÉÅ, ËÏÍÍÕÎÉÓÔÉÞÅÓËÉÅ É ÁÎÁÒÈÉÞÅÓËÉÅ ÏÒÇÁÎÉÚÁÃÉÉ óûá É ëÁÎÁÄÙ ÕÓÔÒÏÉÌÉ ÒÑÄ ÍÉÔÉÎÇÏ× É ÄÅÍÏÎÓÔÒÁÃÉÊ. ðÒÉ ÒÁÚÇÏÎÅ ÔÁËÏÊ ÄÅÍÏÎÓÔÒÁÃÉÉ × þÉËÁÇÏ 4 ÍÁÑ ÐÏÇÉÂÌÏ ÛÅÓÔØ ÄÅÍÏÎÓÔÒÁÎÔÏ×. ÷ ÈÏÄÅ ÐÏÓÌÅÄÏ×Á×ÛÉÈ ÚÁ ÜÔÉÍ ÍÁÓÓÏ×ÙÈ ×ÙÓÔÕÐÌÅÎÉÊ ÐÒÏÔÅÓÔÁ ÐÒÏÔÉ× ÖÅÓÔÏËÉÈ ÄÅÊÓÔ×ÉÊ ÐÏÌÉÃÉÉ × ÒÅÚÕÌØÔÁÔÅ ×ÚÒÙ×Á ÂÏÍÂÙ ÐÏÓÌÅÄÏ×Á×ÛÅÊ ÐÅÒÅÓÔÒÅÌËÅ ÂÙÌÏ ÕÂÉÔÏ ×ÏÓÅÍØ ÐÏÌÉÃÅÊÓËÉÈ É ÍÉÎÉÍÕÍ ÞÅÔ×ÅÒÏ ÒÁÂÏÞÉÈ (ÐÏ ÎÅËÏÔÏÒÙÍ ÄÁÎÎÙÍ, ÄÏ ÐÑÔÉÄÅÓÑÔÉ ÕÂÉÔÙÈ É ÒÁÎÅÎÙÈ[2]), ÎÅÓËÏÌØËÏ ÄÅÓÑÔËÏ× ÞÅÌÏ×ÅË ÐÏÌÕÞÉÌÉ ÒÁÎÅÎÉÑ. ðÏ ÏÂ×ÉÎÅÎÉÀ × ÏÒÇÁÎÉÚÁÃÉÉ ×ÚÒÙ×Á ÞÅÔ×ÅÒÏ ÒÁÂÏÞÉÈ-ÁÎÁÒÈÉÓÔÏ× ÂÙÌÉ ÐÒÉÇÏ×ÏÒÅÎÙ Ë ÐÏ×ÅÛÅÎÉÀ (×ÐÏÓÌÅÄÓÔ×ÉÉ ÂÙÌÏ ÄÏËÁÚÁÎÏ, ÞÔÏ ÏÂ×ÉÎÅÎÉÅ ÂÙÌÏ ÌÏÖÎÙÍ)[3]. éÍÅÎÎÏ × ÐÁÍÑÔØ Ï ËÁÚΣÎÎÙÈ ðÁÒÉÖÓËÉÊ ËÏÎÇÒÅÓÓ II éÎÔÅÒÎÁÃÉÏÎÁÌÁ (ÉÀÌØ 1889) ÏÂßÑ×ÉÌ 1 ÍÁÑ äÎ£Í ÓÏÌÉÄÁÒÎÏÓÔÉ ÒÁÂÏÞÉÈ ×ÓÅÇÏ ÍÉÒÁ É ÐÒÅÄÌÏÖÉÌ ÅÖÅÇÏÄÎÏ ÏÔÍÅÞÁÔØ ÅÇÏ ÄÅÍÏÎÓÔÒÁÃÉÑÍÉ Ó ÓÏÃÉÁÌØÎÙÍÉ ÔÒÅÂÏ×ÁÎÉÑÍÉ. \ No newline at end of file diff --git a/test/data/encodings/lang_hebrew b/test/data/encodings/lang_hebrew new file mode 100644 index 0000000..c503ff1 --- /dev/null +++ b/test/data/encodings/lang_hebrew @@ -0,0 +1,20 @@ +מה ×–×” יוניקוד (Unicode)? + +יוניקוד מקצה מספר ייחודי לכל תו, +×œ× ×ž×©× ×” על ×יזו פלטפורמה, +×œ× ×ž×©× ×” ב×יזו תוכנית, +×•×œ× ×ž×©× ×” ב×יזו שפה. + +ב×ופן בסיסי, ×ž×—×©×‘×™× ×¢×•×¡×§×™× ×¨×§ במספרי×. ×”× ×ž××—×¡× ×™× ×ותיות ×•×ª×•×•×™× ××—×¨×™× ×¢×œ-ידי הקצ×ת מספר לכל ×חד מה×. ×‘×˜×¨× ×”×•×ž×¦× ×”×™×•× ×™×§×•×“, היו מ×ות מערכות קידוד שונות להקצ×ת ×”×ž×¡×¤×¨×™× ×”×œ×œ×•. ××£ ×œ× ×חת מהן יכלה להכיל כמות ×ª×•×•×™× ×ž×¡×¤×§×ª. לדוגמ×: רק ל×יחוד ×”×ירופ××™ × ×“×¨×©×™× ×›×ž×” סוגי ×§×™×“×•×“×™× ×©×•× ×™× ×¢×œ מנת לכסות ×ת כל השפות המדוברות בו. יתירה מז×ת ××£ לשפה בודדת, כמו ×נגלית למשל, ×œ× ×”×™×” די במערכת קידוד ×חת בעבור כל ×”×ותיות, סימני הפיסוק ×•×”×¡×ž×œ×™× ×”×˜×›× ×™×™× ×©×‘×©×™×ž×•×© שוטף. + +מערכות קידוד ×לו ××£ סותרות זו ×ת זו. כלומר, שני ×§×™×“×•×“×™× ×™×›×•×œ×™× ×œ×”×©×ª×ž×© ב×ותו מספר לשני ×ª×•×™× × ×‘×“×œ×™×, ×ו להשתמש ×‘×ž×¡×¤×¨×™× ×©×•× ×™× ×œ×ותו תו. על כל מחשב (ובמיוחד שרתי×) לתמוך במספר רב של מערכות קידוד שונות; ××•×œ× ×›×œ ×ימת ×©× ×ª×•× ×™× ×¢×•×‘×¨×™× ×‘×™×Ÿ מערכות קידוד ×ו פלטפורמות שונות ×§×™×™× ×”×¡×™×›×•×Ÿ שייפגמו. +יוניקוד משנה ×ת כל ×–×”! + +יוניקוד מקצה מספר ייחודי לכל תו, ×œ×œ× ×ª×œ×•×ª בפלטפורמה, בתוכנית, ×ו בשפה. תקן היוניקוד ×ומץ על-ידי ×”×ž×•×‘×™×œ×™× ×‘×ª×¢×©×™×™×” כמו Appleâ€, HPâ€, IBMâ€, JustSystemâ€, Microsoftâ€, Oracleâ€, SAPâ€, Sunâ€, Sybaseâ€, Unisysâ€ ×•×¨×‘×™× ×חרי×. יוניקוד נדרש על-ידי ×ª×§× ×™× ×ž×•×“×¨× ×™×™× ×›×ž×• XMLâ€, Javaâ€, ECMAScript (JavaScript)‎â€, LDAPâ€, CORBA 3.0‎â€, WML†וכדומה, ומהווה למעשה ×ת ×”×™×™×©×•× ×”×¨×©×ž×™ של תקן ISO/IEC 10646. ×”×•× × ×ª×ž×š על ידי מערכות הפעלה רבות, כל ×”×“×¤×“×¤× ×™× ×”×—×“×™×©×™×, ×•×ž×•×¦×¨×™× ×¨×‘×™× ×חרי×. הופעת תקן היוניקוד וזמינות ×”×›×œ×™× ×”×ª×•×ž×›×™× ×‘×• נמנות ×¢× ×”×ž×’×ž×•×ª הכלל-עולמיות החשובות ביותר, ×שר מסתמנות ל×חרונה בטכנולוגיית התוכנה. + +שילוב יוניקוד ביישומי שרת-לקוח ×ו ×‘×™×™×©×•×ž×™× ×¨×‘×™-שכבות וב×תרי ×ינטרנט מ×פשר חיסכון ניכר בעלויות לעומת השימוש בסדרות ×”×ª×•×•×™× ×”×ž×¡×•×¨×ª×™×•×ª. הודות ליוניקוד, מוצר תוכנה ×חד ×ו ×תר יחיד ברשת יכול להרחיב ×ת יעדיו למגוון פלטפורמות, ×רצות ושפות ×œ×œ× ×¦×•×¨×š ×‘×©×™× ×•×™×™× ×ž×¨×—×™×§×™×. יוניקוד מ×פשר מעבר × ×ª×•× ×™× ×“×¨×š מערכות רבות ושונות מבלי שייפגמו. +×¤×¨×˜×™× ×ודות ×”×§×•× ×¡×•×¨×¦×™×•× ×©×œ יוניקוד (Unicode Consortium) + +×”×§×•× ×¡×•×¨×¦×™×•× ×©×œ יוניקוד ×”×•× ×רגון ×œ×œ× ×ž×˜×¨×ª רווח שנוסד כדי לפתח, להרחיב ×•×œ×§×“× ×ת השימוש בתקן יוניקוד, ×שר מגדיר ×ת ייצוג הטקסט במוצרי תוכנה ×•×ª×§× ×™× ×ž×•×“×¨× ×™×™×. ×—×‘×¨×™× ×‘×§×•× ×¡×•×¨×¦×™×•× ×ž×’×•×•×Ÿ רחב של ת××’×™×“×™× ×•××¨×’×•× ×™× ×‘×ª×¢×©×™×™×ª ×”×ž×—×©×‘×™× ×•×¢×™×‘×•×“ המידע. ×”×§×•× ×¡×•×¨×¦×™×•× ×ž×ž×•×ž×Ÿ על-ידי דמי-חבר בלבד. החברות ×‘×§×•× ×¡×•×¨×¦×™×•× ×™×•× ×™×§×•×“ פתוחה ל××¨×’×•× ×™× ×•×œ×× ×©×™× ×¤×¨×˜×™×™×, בכל רחבי העול×, ×שר ×ª×•×ž×›×™× ×‘×ª×§×Ÿ יוניקוד ×•×ž×¢×•× ×™×™× ×™× ×œ×¡×™×™×¢ בהתפתחותו והטמעתו. + +למידע נוסף, ר××” מילון מונחי×, רשימה חלקית של ×ž×•×¦×¨×™× ×ž×•×ª××ž×™× ×œ×™×•× ×™×§×•×“, ×ž×‘×•× ×˜×›× ×™ ו- חומרי עזר [×§×™×©×•×¨×™× ×‘×נגלית]. \ No newline at end of file diff --git a/test/data/encodings/lang_turkish b/test/data/encodings/lang_turkish new file mode 100644 index 0000000..ca46409 --- /dev/null +++ b/test/data/encodings/lang_turkish @@ -0,0 +1,4 @@ +Leylek (Ciconia ciconia), leylekgiller (Ciconiidae) familyasından büyük ve uzun bacaklı bir kuÅŸ türü. Siyah kanat uçuÅŸ tüylerinin dışında tamamen beyazdır, gagası ve bacakları eriÅŸkinlerde kırmızı, yavrularda ise siyahtır. Cüssesi biraz farklı olan iki alttürü ise Avrupa'da (kuzeyde Finlandiya'ya kadar), kuzeybatı Afrika'da ve güneybatı Asya'da (doÄŸuda Kazakistan'ın güneyine kadar) bulunur. Leylekler uzun mesafelere göç ederler. ÇoÄŸunlukla tropikal Sahraaltı Afrika'dan Güney Afrika'nın güneyine ve hatta Hindistan altkıtasının güneyine kadar olan bölgede kışı geçirirler. Avrupa'dan Afrika'ya göç ederken Akdeniz üzerinden deÄŸil, doÄŸuda Levant üzerinden, batıda da Cebelitarık BoÄŸazı'ndan geçerler. Bunun nedeni uçmak için gereksinim duydukları hava termallerinin deniz üzerinde oluÅŸmamasıdır. Yerde yürürken durmadan, yavaşça hareket ederler. Leylekgiller ailesinin diÄŸer üyeleri gibi boynu tamamen gerilmiÅŸ ÅŸekilde uçarlar. +Etçil olan leylek, böcekler, balık, amfibiler, sürüngenler, küçük memeliler ve küçük kuÅŸlar gibi çok geniÅŸ bir yelpazede beslenir. Besinlerinin çoÄŸunu yerden, kısa bitki örtüsü içinden ve sığ sulardan toplar. TekeÅŸli olarak ürerler ancak yaÅŸam boyunca sürecek bir çift bağı kurmazlar. Hem erkeÄŸi hem de diÅŸisi, çubuklardan oluÅŸan ve birkaç yıl kullanılabilen büyük bir yuva yapar. DiÅŸi leylek her yıl bir kereliÄŸine olmak üzere dört yumurta yumurtlar ve yavrular 33-34 gün sonra aynı anda olmamak üzere yumurtadan çıkar. Çifti oluÅŸturan kuÅŸların ikisi de kuluçkaya yatar ve birlikte yavruları beslerler. Yavrular yumurtadan çıktıktan 58-64 gün sonra yuvadan ayrılır ve 7 ila 20 gün daha ebeveynler tarafından beslenir. +Leylek, Dünya DoÄŸa ve DoÄŸal Kaynakları Koruma BirliÄŸi (IUCN) tarafından asgari endiÅŸe altındaki türler arasında sınıflandırılmıştır. Orta ÇaÄŸ boyunca ormanların azalması leyleklerin yararına olmuÅŸtur ancak tarım pratiklerinin deÄŸiÅŸmesi ve sanayileÅŸme 19. yüzyılda ve 20. yüzyılın baÅŸlarında Avrupa'nın bazı bölgelerinde popülasyonlarının azalmasına ve hatta yok olmasına neden olmuÅŸtur. Avrupa çapındaki koruma programlarının sonucunda leyleklerin tekrar Hollanda, Belçika, Ä°sviçre ve Ä°sveç'te üremeleri saÄŸlanmıştır. DoÄŸal düşmanlarının sayısı azdır ancak çeÅŸitli parazitler taşıyabilir. Dikkat çekici bir tür olan leylek tarih boyunca bulunduÄŸu bölgelerde çeÅŸitli söylencelere konu olmuÅŸtur. Bunların en bilineni, bebeklerin leylekler tarafından getirildiÄŸi söylencesidir. +€ ‚ Æ’ „ … † ‡ Å  \ No newline at end of file diff --git a/test/data/encodings/windows_1250 b/test/data/encodings/windows_1250 new file mode 100644 index 0000000..8f78952 --- /dev/null +++ b/test/data/encodings/windows_1250 @@ -0,0 +1,5 @@ +Velký a Malý Tisý je národní pøírodní rezervace ev. è. 498 poblíž mìsta Lomnice nad Lužnicí v okrese Jindøichùv Hradec ležící na území CHKO Tøeboòsko. Øadí se mezi nejvýznamnìjší rybnièní rezervace v Èesku a je významná rozsáhlým litorálním porostem na bøezích rybníkù. Oblast spravuje AOPK ÈR Správa CHKO Tøeboòsko a je evidována i v rámci svìtové organizace UNESCO jako biosférická rezervace, Natura 2000 a další. Dùvodem ochrany je jedna z nejvýznamnìjších ornitologických rezervací v Èesku. Význam má i z pohledu entomologického. +Souèástí rezervace je 11 vìtších rybníkù, mimo jiné i dvojice rybníkù Velký a Malý Tisý, které daly lokalitì název. Pro rybníky v rezervaci je charakteristické, že mají velmi èlenité pobøeží tvoøené zarostlými bøehy, zátokami, poloostrovy a ostrùvky. Na bøehy volnì navazují podmáèené louky, lesy, vøesovištì a pole. Vlivem rozmanitosti rùzných stanoviš se zde nachází bohatá øada druhù z flory i fauny, které zde sídlí. Hlavnì ptactvo využívá lokalitu jako dùležitou migraèní zastávku èi shromaždištì pøed pravidelnými tahy. +I pøes to, že je lokalita po desetiletí chránìna, došlo nevhodnými hospodáøskými zásahy v podobì nadmìrného chovu ryb od 50. let 20. století k postupné degradaci a ústupu litorálních porostù. Od 90. let 20. století se ochranáøi snaží snižováním poètu nasazovaných ryb a zmìnou jejich druhové skladby spoleènì s vodohospodáøskými zásahy do výšky vodní hladiny rybníku Velký Tisý podpoøit rozvoj rákosových porostù. Výsledky tìchto opatøení ukázaly, že na obnovu porostù by i za vhodných podmínek byla potøeba doba dosahující až desítek let. + +€ ‚ „ … † ‡ \ No newline at end of file diff --git a/test/data/encodings/windows_1251 b/test/data/encodings/windows_1251 new file mode 100644 index 0000000..bd00c64 --- /dev/null +++ b/test/data/encodings/windows_1251 @@ -0,0 +1 @@ +Ïåðâîìàé â ñîâðåìåííîì âèäå âîçíèê â êîíöå XIX âåêà â ðàáî÷åì äâèæåíèè, âûäâèíóâøåì â êà÷åñòâå îäíîãî èç îñíîâíûõ òðåáîâàíèé ââåäåíèå âîñüìè÷àñîâîãî ðàáî÷åãî äíÿ. 1 ìàÿ 1886 ãîäà ñîöèàëèñòè÷åñêèå, êîììóíèñòè÷åñêèå è àíàðõè÷åñêèå îðãàíèçàöèè ÑØÀ è Êàíàäû óñòðîèëè ðÿä ìèòèíãîâ è äåìîíñòðàöèé. Ïðè ðàçãîíå òàêîé äåìîíñòðàöèè â ×èêàãî 4 ìàÿ ïîãèáëî øåñòü äåìîíñòðàíòîâ.  õîäå ïîñëåäîâàâøèõ çà ýòèì ìàññîâûõ âûñòóïëåíèé ïðîòåñòà ïðîòèâ æåñòîêèõ äåéñòâèé ïîëèöèè â ðåçóëüòàòå âçðûâà áîìáû ïîñëåäîâàâøåé ïåðåñòðåëêå áûëî óáèòî âîñåìü ïîëèöåéñêèõ è ìèíèìóì ÷åòâåðî ðàáî÷èõ (ïî íåêîòîðûì äàííûì, äî ïÿòèäåñÿòè óáèòûõ è ðàíåíûõ[2]), íåñêîëüêî äåñÿòêîâ ÷åëîâåê ïîëó÷èëè ðàíåíèÿ. Ïî îáâèíåíèþ â îðãàíèçàöèè âçðûâà ÷åòâåðî ðàáî÷èõ-àíàðõèñòîâ áûëè ïðèãîâîðåíû ê ïîâåøåíèþ (âïîñëåäñòâèè áûëî äîêàçàíî, ÷òî îáâèíåíèå áûëî ëîæíûì)[3]. Èìåííî â ïàìÿòü î êàçí¸ííûõ Ïàðèæñêèé êîíãðåññ II Èíòåðíàöèîíàëà (èþëü 1889) îáúÿâèë 1 ìàÿ Äí¸ì ñîëèäàðíîñòè ðàáî÷èõ âñåãî ìèðà è ïðåäëîæèë åæåãîäíî îòìå÷àòü åãî äåìîíñòðàöèÿìè ñ ñîöèàëüíûìè òðåáîâàíèÿìè. \ No newline at end of file diff --git a/test/data/encodings/windows_1252 b/test/data/encodings/windows_1252 new file mode 100644 index 0000000..4faae5d --- /dev/null +++ b/test/data/encodings/windows_1252 @@ -0,0 +1,22 @@ +What is Unicode? + +Unicode provides a unique number for every character, +no matter what the platform, +no matter what the program, +no matter what the language. + +Fundamentally, computers just deal with numbers. They store letters and other characters by assigning a number for each one. Before Unicode was invented, there were hundreds of different encoding systems for assigning these numbers. No single encoding could contain enough characters: for example, the European Union alone requires several different encodings to cover all its languages. Even for a single language like English no single encoding was adequate for all the letters, punctuation, and technical symbols in common use. + +These encoding systems also conflict with one another. That is, two encodings can use the same number for two different characters, or use different numbers for the same character. Any given computer (especially servers) needs to support many different encodings; yet whenever data is passed between different encodings or platforms, that data always runs the risk of corruption. +Unicode is changing all that! + +Unicode provides a unique number for every character, no matter what the platform, no matter what the program, no matter what the language. The Unicode Standard has been adopted by such industry leaders as Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, Sun, Sybase, Unisys and many others. Unicode is required by modern standards such as XML, Java, ECMAScript (JavaScript), LDAP, CORBA 3.0, WML, etc., and is the official way to implement ISO/IEC 10646. It is supported in many operating systems, all modern browsers, and many other products. The emergence of the Unicode Standard, and the availability of tools supporting it, are among the most significant recent global software technology trends. + +Incorporating Unicode into client-server or multi-tiered applications and websites offers significant cost savings over the use of legacy character sets. Unicode enables a single software product or a single website to be targeted across multiple platforms, languages and countries without re-engineering. It allows data to be transported through many different systems without corruption. +About the Unicode Consortium + +The Unicode Consortium is a non-profit organization founded to develop, extend and promote use of the Unicode Standard, which specifies the representation of text in modern software products and standards. The membership of the consortium represents a broad spectrum of corporations and organizations in the computer and information processing industry. The consortium is supported financially solely through membership dues. Membership in the Unicode Consortium is open to organizations and individuals anywhere in the world who support the Unicode Standard and wish to assist in its extension and implementation. + +For more information, see the Glossary, Unicode Enabled Products, Technical Introduction and Useful Resources. + +€ ‚ ƒ „ … † ‡ Š \ No newline at end of file diff --git a/test/data/encodings/windows_1253 b/test/data/encodings/windows_1253 new file mode 100644 index 0000000..dcf1a89 --- /dev/null +++ b/test/data/encodings/windows_1253 @@ -0,0 +1,3 @@ +Ç ËáúêÞ Þ Äçìþäçò ËáôéíéêÞ (ëáô. sermo vulgaris) åßíáé Ýíáò üñïò-ïìðñÝëá, ï ïðïßïò êáëýðôåé ôéò äéáëÝêôïõò ôÞò ëáôéíéêÞò ãëþóóáò ðïõ ïìéëïýíôáí êõñßùò óôéò äõôéêÝò åðáñ÷ßåò ôÞò ÑùìáúêÞò Áõôïêñáôïñßáò, ìÝ÷ñéò üôïõ áõôÝò ïé äéÜëåêôïé, áðïêëßíïíôáò áêüìç ðåñéóóüôåñï, åîåëß÷èçêáí óôéò ðñþéìåò ñïìáíéêÝò ãëþóóåò êáôÜ ôïí 9ï áéþíá. +Ç ïìéëïõìÝíç ËáôéíéêÞ äéÝöåñå áðü ôç ëïãïôå÷íéêÞ êëáóéêÞ ËáôéíéêÞ óôçí ðñïöïñÜ, ôï ëåîéëüãéï êáé ôç ãñáììáôéêÞ. ÊÜðïéá ÷áñáêôçñéóôéêÜ ôçò äçìþäïõò ËáôéíéêÞò äåí åìöáíßóôçêáí ðáñÜ óôçí ýóôåñç Áõôïêñáôïñßá. ¢ëëá ÷áñáêôçñéóôéêÜ ôçò õðÞñ÷áí ðéèáíüí óôçí ïìéëïõìÝíç ËáôéíéêÞ, ôïõëÜ÷éóôïí óôéò ðñùôïãåíåßò ìïñöÝò ôïõò, ðïëý íùñßôåñá. Ïé ðåñéóóüôåñïé ïñéóìïß ôÞò äçìþäïõò ËáôéíéêÞò ôçí ðáñïõóéÜæïõí ùò ðñïöïñéêÞ ðáñÜ ùò ãñáðôÞ ãëþóóá, åðåéäÞ ïé ìáñôõñßåò ïäçãïýí óôï óõìðÝñáóìá üôé ç ïìéëïõìÝíç ËáôéíéêÞ äéáóðÜóôçêå óå áðïêëßíïõóåò äéáëÝêôïõò áõôÞ ôçí ðåñßïäï. ÅðåéäÞ êáíåßò ôüôå äåí ìåôÝãñáøå öùíçôéêÜ ôçí êáèçìåñéíÞ ïìéëßá ôùí Ëáôßíùí, ïé ìåëåôçôÝò ôÞò ëáúêÞò ËáôéíéêÞò ðñÝðåé íá ÷ñçóéìïðïéïýí Ýììåóåò ìåèüäïõò. +€ ‚ ƒ „ … † ‡ \ No newline at end of file diff --git a/test/data/encodings/windows_1254 b/test/data/encodings/windows_1254 new file mode 100644 index 0000000..d12a389 --- /dev/null +++ b/test/data/encodings/windows_1254 @@ -0,0 +1,4 @@ +Leylek (Ciconia ciconia), leylekgiller (Ciconiidae) familyasýndan büyük ve uzun bacaklý bir kuþ türü. Siyah kanat uçuþ tüylerinin dýþýnda tamamen beyazdýr, gagasý ve bacaklarý eriþkinlerde kýrmýzý, yavrularda ise siyahtýr. Cüssesi biraz farklý olan iki alttürü ise Avrupa'da (kuzeyde Finlandiya'ya kadar), kuzeybatý Afrika'da ve güneybatý Asya'da (doðuda Kazakistan'ýn güneyine kadar) bulunur. Leylekler uzun mesafelere göç ederler. Çoðunlukla tropikal Sahraaltý Afrika'dan Güney Afrika'nýn güneyine ve hatta Hindistan altkýtasýnýn güneyine kadar olan bölgede kýþý geçirirler. Avrupa'dan Afrika'ya göç ederken Akdeniz üzerinden deðil, doðuda Levant üzerinden, batýda da Cebelitarýk Boðazý'ndan geçerler. Bunun nedeni uçmak için gereksinim duyduklarý hava termallerinin deniz üzerinde oluþmamasýdýr. Yerde yürürken durmadan, yavaþça hareket ederler. Leylekgiller ailesinin diðer üyeleri gibi boynu tamamen gerilmiþ þekilde uçarlar. +Etçil olan leylek, böcekler, balýk, amfibiler, sürüngenler, küçük memeliler ve küçük kuþlar gibi çok geniþ bir yelpazede beslenir. Besinlerinin çoðunu yerden, kýsa bitki örtüsü içinden ve sýð sulardan toplar. Tekeþli olarak ürerler ancak yaþam boyunca sürecek bir çift baðý kurmazlar. Hem erkeði hem de diþisi, çubuklardan oluþan ve birkaç yýl kullanýlabilen büyük bir yuva yapar. Diþi leylek her yýl bir kereliðine olmak üzere dört yumurta yumurtlar ve yavrular 33-34 gün sonra ayný anda olmamak üzere yumurtadan çýkar. Çifti oluþturan kuþlarýn ikisi de kuluçkaya yatar ve birlikte yavrularý beslerler. Yavrular yumurtadan çýktýktan 58-64 gün sonra yuvadan ayrýlýr ve 7 ila 20 gün daha ebeveynler tarafýndan beslenir. +Leylek, Dünya Doða ve Doðal Kaynaklarý Koruma Birliði (IUCN) tarafýndan asgari endiþe altýndaki türler arasýnda sýnýflandýrýlmýþtýr. Orta Çað boyunca ormanlarýn azalmasý leyleklerin yararýna olmuþtur ancak tarým pratiklerinin deðiþmesi ve sanayileþme 19. yüzyýlda ve 20. yüzyýlýn baþlarýnda Avrupa'nýn bazý bölgelerinde popülasyonlarýnýn azalmasýna ve hatta yok olmasýna neden olmuþtur. Avrupa çapýndaki koruma programlarýnýn sonucunda leyleklerin tekrar Hollanda, Belçika, Ýsviçre ve Ýsveç'te üremeleri saðlanmýþtýr. Doðal düþmanlarýnýn sayýsý azdýr ancak çeþitli parazitler taþýyabilir. Dikkat çekici bir tür olan leylek tarih boyunca bulunduðu bölgelerde çeþitli söylencelere konu olmuþtur. Bunlarýn en bilineni, bebeklerin leylekler tarafýndan getirildiði söylencesidir. +€ ‚ ƒ „ … † ‡ Š \ No newline at end of file diff --git a/test/data/encodings/windows_1255 b/test/data/encodings/windows_1255 new file mode 100644 index 0000000..461a9a8 --- /dev/null +++ b/test/data/encodings/windows_1255 @@ -0,0 +1,32 @@ +îä æä éåðé÷åã (Unicode)? + +éåðé÷åã î÷öä îñôø ééçåãé ìëì úå, +ìà îùðä òì àéæå ôìèôåøîä, +ìà îùðä áàéæå úåëðéú, +åìà îùðä áàéæå ùôä. + +áàåôï áñéñé, îçùáéí òåñ÷éí ø÷ áîñôøéí. äí îàçñðéí àåúéåú åúååéí àçøéí òì-éãé ä÷öàú îñôø ìëì àçã îäí. áèøí äåîöà äéåðé÷åã, äéå îàåú îòøëåú ÷éãåã ùåðåú ìä÷öàú äîñôøéí äììå. àó ìà àçú îäï éëìä ìäëéì ëîåú úååéí îñô÷ú. ìãåâîà: ø÷ ìàéçåã äàéøåôàé ðãøùéí ëîä ñåâé ÷éãåãéí ùåðéí òì îðú ìëñåú àú ëì äùôåú äîãåáøåú áå. éúéøä îæàú àó ìùôä áåããú, ëîå àðâìéú ìîùì, ìà äéä ãé áîòøëú ÷éãåã àçú áòáåø ëì äàåúéåú, ñéîðé äôéñå÷ åäñîìéí äèëðééí ùáùéîåù ùåèó. + +îòøëåú ÷éãåã àìå àó ñåúøåú æå àú æå. ëìåîø, ùðé ÷éãåãéí éëåìéí ìäùúîù áàåúå îñôø ìùðé úåéí ðáãìéí, àå ìäùúîù áîñôøéí ùåðéí ìàåúå úå. òì ëì îçùá (åáîéåçã ùøúéí) ìúîåê áîñôø øá ùì îòøëåú ÷éãåã ùåðåú; àåìí ëì àéîú ùðúåðéí òåáøéí áéï îòøëåú ÷éãåã àå ôìèôåøîåú ùåðåú ÷ééí äñéëåï ùééôâîå. +éåðé÷åã îùðä àú ëì æä! + +éåðé÷åã î÷öä îñôø ééçåãé ìëì úå, ììà úìåú áôìèôåøîä, áúåëðéú, àå áùôä. ú÷ï äéåðé÷åã àåîõ òì-éãé äîåáéìéí áúòùééä ëîå Appleþ, HPþ, IBMþ, JustSystemþ, Microsoftþ, Oracleþ, SAPþ, Sunþ, Sybaseþ, Unisysþ åøáéí àçøéí. éåðé÷åã ðãøù òì-éãé ú÷ðéí îåãøðééí ëîå XMLþ, Javaþ, ECMAScript (JavaScript)ýþ, LDAPþ, CORBA 3.0ýþ, WMLþ åëãåîä, åîäååä ìîòùä àú äééùåí äøùîé ùì ú÷ï ISO/IEC 10646. äåà ðúîê òì éãé îòøëåú äôòìä øáåú, ëì äãôãôðéí äçãéùéí, åîåöøéí øáéí àçøéí. äåôòú ú÷ï äéåðé÷åã åæîéðåú äëìéí äúåîëéí áå ðîðåú òí äîâîåú äëìì-òåìîéåú äçùåáåú áéåúø, àùø îñúîðåú ìàçøåðä áèëðåìåâééú äúåëðä. + +ùéìåá éåðé÷åã áééùåîé ùøú-ì÷åç àå áééùåîéí øáé-ùëáåú åáàúøé àéðèøðè îàôùø çéñëåï ðéëø áòìåéåú ìòåîú äùéîåù áñãøåú äúååéí äîñåøúéåú. äåãåú ìéåðé÷åã, îåöø úåëðä àçã àå àúø éçéã áøùú éëåì ìäøçéá àú éòãéå ìîâååï ôìèôåøîåú, àøöåú åùôåú ììà öåøê áùéðåééí îøçé÷éí. éåðé÷åã îàôùø îòáø ðúåðéí ãøê îòøëåú øáåú åùåðåú îáìé ùééôâîå. +ôøèéí àåãåú ä÷åðñåøöéåí ùì éåðé÷åã (Unicode Consortium) + +ä÷åðñåøöéåí ùì éåðé÷åã äåà àøâåï ììà îèøú øååç ùðåñã ëãé ìôúç, ìäøçéá åì÷ãí àú äùéîåù áú÷ï éåðé÷åã, àùø îâãéø àú ééöåâ äè÷ñè áîåöøé úåëðä åú÷ðéí îåãøðééí. çáøéí á÷åðñåøöéåí îâååï øçá ùì úàâéãéí åàøâåðéí áúòùééú äîçùáéí åòéáåã äîéãò. ä÷åðñåøöéåí îîåîï òì-éãé ãîé-çáø áìáã. äçáøåú á÷åðñåøöéåí éåðé÷åã ôúåçä ìàøâåðéí åìàðùéí ôøèééí, áëì øçáé äòåìí, àùø úåîëéí áú÷ï éåðé÷åã åîòåðééðéí ìñééò áäúôúçåúå åäèîòúå. + +ìîéãò ðåñó, øàä îéìåï îåðçéí, øùéîä çì÷éú ùì îåöøéí îåúàîéí ìéåðé÷åã, îáåà èëðé å- çåîøé òæø [÷éùåøéí áàðâìéú]. +Ë + +€ 130 ƒ +132 … † ‡ +135 + +136 ‰ +2030 +137 + +138 ‹ +2039 \ No newline at end of file diff --git a/test/data/encodings/windows_1256 b/test/data/encodings/windows_1256 new file mode 100644 index 0000000..4cc460b --- /dev/null +++ b/test/data/encodings/windows_1256 @@ -0,0 +1,8 @@ +ÞÈá ÍÑÈ 1948 ßÇäÊ ÇáãäØÞÉ ÌÒÁÇð ãä ÇáÇäÊÏÇÈ ÇáÈÑíØÇäí Úáì ÝáÓØíä. ÈÞíÊ ÃÑÇÖí ÇáÖÝÉ ÇáÛÑÈíÉ Ýí ÃíÇÏí ÇáÌíÔ ÇáÃÑÏäí ÈÚÏ ÇáÊæÞíÚ Úáì ÇÊÝÇÞíÇÊ ÇáåÏäÉ (ÇÊÝÇÞíÇÊ ÑæÏÓ) ÇáÊí ÃäåÊ ÇáÍÑÈ ÚÇã 1949 æÑÓãÊ ÇáÍÏæÏ ÇáÝÇÕáÉ Èíä ÇáÖÝÉ ÇáÛÑÈíÉ æÇáÃÑÇÖí ÇáÊí ÃÞíãÊ ÚáíåÇ ÏæáÉ ÅÓÑÇÆíá. åÐå ÇáÍÏæÏ (ÇáÊí åí ÌÒÁ ãä ÇáÎØ ÇáÃÎÖÑ) ÖãÊ Åáì ÇáÖÝÉ ÇáÛÑÈíÉ ÇáÌÒÁ ÇáÔÑÞí áãÏíäÉ ÇáÞÏÓ¡ ÈãÇ Ýí Ðáß ÇáÈáÏÉ ÇáÞÏíãÉ¡ ãÇ ÚÏÇ ÌÈá ÇáãÔÇÑÝ. +ÊãÊ ÇáæÍÏÉ Èíä ÇáÖÝÊíä ÇáÔÑÞíÉ (ÇáÃÑÏäíÉ) æÇáÛÑÈíÉ (ÇáÝáÓØíäíÉ) ÈÚÏ ãÄÊãÑ ÃÑíÍÇ ÚÇã 1951ã ÇáÐí ØÇáÈ ÈÇáæÍÏÉ. ÙáÊ åÐå ÇáæÍÏÉ ÞÇÆãÉ ãÚ ÇáÖÝÉ ÇáÔÑÞíÉ æÇÚÊÈÇÑ ÃåÇáí ÇáÖÝÉ ÇáÛÑÈíÉ ãæÇØäííä ÃÑÏäííä ÍÊì ÚÇã 1988 ÚäÏãÇ ÞÑÑ Çáãáß ÍÓíä ÇáÑÇÍá Ýß ÇáÇÑÊÈÇØ ÇáÞÇäæäí æÇáÅÏÇÑí æÇáãÇáí (ÞÑÇÑ Ýß ÇáÇÑÊÈÇØ) ÈäÇÁÇ Úáì ØáÈ ãäÙãÉ ÇáÊÍÑíÑ ÇáÝáÓØíäíÉ ãÇÚÏÇ ÇáÃæÞÇÝ ÇáÊí ÈÞíÊ ãÑÊÈØÉ ãÚ ÇáÍßæãÉ ÇáÃÑÏäíÉ ÍÊì Çáíæã ãä ÅÔÑÇÝ æÊÚííäÇÊ æÕíÇäÉ ááÃæÞÇÝ ÇáãÓíÍíÉ æÇáÅÓáÇãíÉ æÇáÊÒÇãÇÊ ãÇáíÉ. +Ýí 5 ÍÒíÑÇä 1967 ÇÍÊáÊ ÅÓÑÇÆíá ÃÑÇÖí ÇáÖÝÉ ÇáÛÑÈíÉ (æÃÑÇÖò ÃÎÑì) ÅÈÇä ÍÑÈ ÇáÃíÇã ÇáÓÊÉ (ÇáäßÓÉ) æáÇ ÊÒÇá ÇáÖÝÉ ÎÇÖÚÉ áÃÍßÇã ÇÊÝÇÞíÉ ÌäíÝ ÇáÑÇÈÚÉ ááÃÑÇÖí ÇáãÍÊáÉ. Úáì ÇáÑÛã ãä Ðáß ÞÇãÊ ÅÓÑÇÆíá ÈÈäÇÁ ÇáÚÏíÏ ãä ÇáÜãÓÊæØäÇÊ Ýí ÇáÖÝÉ. ßãÇ ÞÇãÊ ÅÓÑÇÆíá ÈÖã ÇáÞÏÓ ÇáÔÑÞíÉ æÖæÇÍíåÇ ÈÔßá ÃÍÇÏí ÇáÌÇäÈ áã íÚÊÑÝ ÈÔÑÚíÊÉ ÇáãÌÊãÚ ÇáÏæáí. ÊØáÞ ÇáÍßæãÉ ÇáÅÓÑÇÆíáíÉ Úáì ÇáãäØÞÉ ÇÓã "íåæÏÇ æÔæãÑæä" (Ãí "íåæÐÇ æÇáÓÇãÑÉ")¡ ÍíË ÊÐßÑ ÈåÐÇ ÇáÇÓã Ýí ÇáæËÇÆÞ ÇáÅÓÑÇÆíáíÉ ÇáÑÓãíÉ. +Ýí ÚÇã 1993 æÞÚÊ ÅÓÑÇÆíá æãäÙãÉ ÇáÊÍÑíÑ ÇáÝáÓØíäíÉ ÇÊÝÇÞíÉ ÃæÓáæ ÇáÊí äÕÊ Úáì ÅÞÇãÉ ÍßæãÉ ÐÇÊíÉ ÝáÓØíäíÉ ÊÏíÑ ÇáÍíÇÉ ÇáãÏäíÉ Ýí ÇáÖÝÉ ÇáÛÑÈíÉ æÞØÇÚ ÛÒÉ áÝÊÑÉ ÇäÊÞÇáíÉ¡ Úáì Ãä ÊÓÊÃäÝ ÇáãÝÇæÖÇÊ Ýí ÇáÞÖÇíÇ ÇáãÊÈÞíÉ¡ ßÇáÞÏÓ æÇááÇÌÆíä. æÈÇáÝÚá æÝí ÚÇã 1994 ÃÞíãÊ ÇáÓáØÉ ÇáæØäíÉ ÇáÝáÓØíäíÉ Ýí ÈÚÖ ÇáãÏä æÇáÞÑì ÇáÝáÓØíäíÉ ÈÇáÊÏÑíÌ¡ æáßäåÇ ãäÐ ÇäÊÝÇÖÉ ÇáÃÞÕì áÇ ÊÓÊØíÚ ÇáÞíÇã ÈæÇÌÈÇÊåÇ ÈÔßá äÇÌÍ. áÇ íÒÇá åäÇß ãÝÇæÖÇÊ Èíä ÇáÅÓÑÇÆáííä æÇáÝáÓØíäííä æáßäåÇ ßËíÑÇð ãÇ ÊÊÚËÑ ÈÓÈÈ ÅÕÑÇÑ ÅÓÑÇÆíá Úáì ãÊÇÈÚÉ ÇÓÊíØÇäåÇ Ýí ÇáÖÝÉ ÇáÛÑÈíÉ. + + +ÇáÌÏÇÑ ÇáÝÇÕá ÌÑÝ ÇáßËíÑ ãä ÃÑÇÖí ÇáÖÝÉ ÇáÛÑÈíÉ +Ýí ÃÈÑíá 2002 ÔÑÚÊ ÇáÍßæãÉ ÇáÅÓÑÇÆíáíÉ ÈÑÆÇÓÉ ÃÑííá ÔÇÑæä ÈÈäÇÁ ÌÏÇÑ ÝÇÕá ÈíäåÇ æÈíä ÇáÝáÓØíäííä ÏÇÎá ÃÑÇÖí ÇáÖÝÉ ÇáÛÑÈíÉ ÞÇáÊ Ãäå ÈåÏÝ ÍãÇíÉ ÅÓÑÇÆíá ãä ÇáÚãáíÇÊ ÇáÚÓßÑíÉ ÇáÝáÓØíäíÉ. áßäå ÇÞÊÖã ÇáßËíÑ ãä ÇáÃÑÇÖí ÇáÝáÓØíäíÉ æÓÇåã Ýí ÅÍßÇã ÇáÍÕÇÑ Úáì ÇáÔÚÈ ÇáÝáÓØíäí æÅÝÞÇÑ ÇÞÊÕÇÏå ÇáæØäí ÈÔßá ÔÈå ßÇãá. ßãÇ Êã ÚÒá ãÏä æÈáÏÇÊ ÈßÇãáåÇ Úä ãÍíØåÇ ÇáÝáÓØíäí. \ No newline at end of file diff --git a/test/encodings/sbcs.js b/test/encodings/sbcs.js index e369a93..e6588f6 100644 --- a/test/encodings/sbcs.js +++ b/test/encodings/sbcs.js @@ -6,38 +6,106 @@ describe('Singlebyte Character Sets', function() { var base = __dirname + '/../data/encodings'; - it('should return ISO-8859-1', function() { + it('should return ISO-8859-1 (English)', function() { assert.equal( - chardet.detectFileSync(base + '/iso88591_en'), - 'ISO-8859-1' + chardet.detectFileSync(base + '/iso88591_en'), 'ISO-8859-1' ); }); - it('should return ISO-8859-2', function() { + it('should return ISO-8859-2 (Czech)', function() { assert.equal( - chardet.detectFileSync(base + '/iso88592_cs'), - 'ISO-8859-2' + chardet.detectFileSync(base + '/iso88592_cs'), 'ISO-8859-2' ); }); - it('should return ISO-8859-5', function() { + it('should return ISO-8859-3'); + it('should return ISO-8859-4'); + + it('should return ISO-8859-5 (Russian)', function() { assert.equal( - chardet.detectFileSync(base + '/iso88595_ru'), - 'ISO-8859-5' + chardet.detectFileSync(base + '/iso88595_ru'), 'ISO-8859-5' ); }); - it('should return ISO-8859-6', function() { + it('should return ISO-8859-6 (Arabic)', function() { assert.equal( - chardet.detectFileSync(base + '/iso88596_ar'), - 'ISO-8859-6' + chardet.detectFileSync(base + '/iso88596_ar'), 'ISO-8859-6' ); }); - it('should return ISO-8859-7', function() { + it('should return ISO-8859-7 (Greek)', function() { assert.equal( - chardet.detectFileSync(base + '/iso88597_el'), - 'ISO-8859-7' + chardet.detectFileSync(base + '/iso88597_el'), 'ISO-8859-7' + ); + }); + + it('should return ISO-8859-8 (Hebrew)', function() { + assert.equal( + chardet.detectFileSync(base + '/iso88598_he'), 'ISO-8859-8' + ); + }); + + it('should return ISO-8859-9 (Turkish)', function() { + assert.equal( + chardet.detectFileSync(base + '/iso88599_tr'), 'ISO-8859-9' + ); + }); + + it('should return ISO-8859-10'); + it('should return ISO-8859-11'); + // iso-8859-12 is abandoned + it('should return ISO-8859-13'); + it('should return ISO-8859-14'); + it('should return ISO-8859-15'); + it('should return ISO-8859-16'); + + + it('should return windows-1250 (Czech)', function() { + assert.equal( + chardet.detectFileSync(base + '/windows_1250'), 'windows-1250' + ); + }); + + it('should return windows-1251 (Russian)', function() { + assert.equal( + chardet.detectFileSync(base + '/windows_1251'), 'windows-1251' + ); + }); + + it('should return windows-1252 (English)', function() { + assert.equal( + chardet.detectFileSync(base + '/windows_1252'), 'windows-1252' + ); + }); + + it('should return windows-1253 (Greek)', function() { + assert.equal( + chardet.detectFileSync(base + '/windows_1253'), 'windows-1253' + ); + }); + + it('should return windows-1254 (Turkish)', function() { + assert.equal( + chardet.detectFileSync(base + '/windows_1254'), 'windows-1254' + ); + }); + + it('should return windows-1255 (Hebrew)', function() { + assert.equal( + chardet.detectFileSync(base + '/windows_1255'), 'windows-1255' + ); + }); + + it('should return windows-1256 (Arabic)', function() { + assert.equal( + chardet.detectFileSync(base + '/windows_1256'), 'windows-1256' + ); + }); + + + it('should return KOI8-R (Russian)', function() { + assert.equal( + chardet.detectFileSync(base + '/koi8r'), 'KOI8-R' ); });