|
Server : Apache/2.4.62 System : FreeBSD fbsdweb2.web.rcn.net 14.1-RELEASE FreeBSD 14.1-RELEASE releng/14.1-n267679-10e31f0946d8 GENERIC amd64 User : www ( 80) PHP Version : 8.3.8 Disable Function : NONE Directory : /domains/mandarintools/cgi-bin/ |
Upload File : |
# -*- perl -*-
# codelib.pl
#
# Feb. 1997 by Erik E. Peterson, [email protected]
# Free for non-commercial use
# Fee required if used in commercial program.
#
# Perl functions to guess what Chinese encoding a given
# document uses. A probability is generated for each
# code. The probabilities are sorted and returned
# Probabilities are given on a scale of 100, but if
# no guess scores above a 50, "other" is returned as the
# guess. Even when a document is in a given encoding,
# the probability will generally not go above 80.
# To use:
# call codeguess with one argument, a text string
# It will return the sorted array, @codes, of encoding names. The
# value of index 0 of the array is the guess for the text string.
# The rest of the array is the other encodings in order of
# likelihood.
# For example, to guess the encoding for a file:
# require "codelib.pl";
# open(TXTFILE, "somefile.txt");
# undef $/;
# $srctxt = <TXTFILE>;
# close(TXTFILE);
# @guesses = codeguess($srctxt);
# print "The encoding for somefile.txt is $guesses[0].\n";
# Supported Chinese Encodings:
# GB2312-80, Hz, Big5, UTF-8, ASCII; Later SJIS, ISO-8859-1
%gbhash = (
'��' => 500, '��' => 499, '��' => 498, '��' => 497, 'һ' => 496, '��' => 495, '��' => 494, '��' => 493, '��' => 492, '��' => 491,
'��' => 490, '��' => 489, '��' => 488, 'Ҫ' => 487, '��' => 486, '��' => 485, '��' => 484, '��' => 483, '��' => 482, '��' => 481,
'��' => 480, 'Ϊ' => 479, '��' => 478, '��' => 477, 'ѧ' => 476, '��' => 475, '��' => 474, 'Ҳ' => 473, '��' => 472, '��' => 471,
'��' => 470, 'ʱ' => 469, '��' => 468, '˵' => 467, 'û' => 466, '�' => 465, '��' => 464, '��' => 463, '��' => 462, '�' => 461,
'��' => 460, '��' => 459, '��' => 458, '��' => 457, '��' => 456, '��' => 455, '�' => 454, '��' => 453, '��' => 452, 'С' => 451,
'֮' => 450, '��' => 449, '��' => 448, '��' => 447, '��' => 446, '��' => 445, '��' => 444, '��' => 443, '��' => 442, '��' => 441,
'��' => 440, '��' => 439, '��' => 438, '��' => 437, 'Ѷ' => 436, '��' => 435, 'վ' => 434, '��' => 433, 'ֻ' => 432, 'ȥ' => 431,
'֪' => 430, '��' => 429, '��' => 428, '̨' => 427, '��' => 426, '��' => 425, 'ͬ' => 424, '��' => 423, '��' => 422, '��' => 421,
'��' => 420, '��' => 419, '��' => 418, '��' => 417, 'Ȼ' => 416, '��' => 415, '��' => 414, '�' => 413, '��' => 412, '��' => 411,
'��' => 410, '��' => 409, 'ǰ' => 408, '��' => 407, '��' => 406, '��' => 405, '��' => 404, '��' => 403, '��' => 402, '��' => 401,
'��' => 400, '��' => 399, '��' => 398, '��' => 397, '��' => 396, '��' => 395, '��' => 394, '��' => 393, 'Щ' => 392, '��' => 391,
'�' => 390, '�' => 389, '��' => 388, '��' => 387, 'λ' => 386, '�' => 385, '��' => 384, '��' => 383, '��' => 382, '��' => 381,
'�' => 380, 'ʲ' => 379, 'л' => 378, '��' => 377, '�' => 376, '��' => 375, '��' => 374, '��' => 373, 'ʵ' => 372, '��' => 371,
'��' => 370, '��' => 369, '��' => 368, '��' => 367, '��' => 366, '��' => 365, '��' => 364, '��' => 363, '��' => 362, '��' => 361,
'��' => 360, 'Ӧ' => 359, 'Ů' => 358, '��' => 357, '��' => 356, '��' => 355, '��' => 354, '��' => 353, 'ϵ' => 352, '��' => 351,
'��' => 350, '��' => 349, '��' => 348, '�' => 347, '��' => 346, '̫' => 345, '·' => 344, '��' => 343, '��' => 342, '��' => 341,
'�' => 340, '��' => 339, 'ʮ' => 338, '��' => 337, '��' => 336, '��' => 335, '��' => 334, 'ʹ' => 333, '��' => 332, '��' => 331,
'��' => 330, '��' => 329, '�' => 328, '��' => 327, '��' => 326, '��' => 325, 'ȫ' => 324, '��' => 323, '��' => 322, '��' => 321,
'��' => 320, '��' => 319, '��' => 318, 'ʽ' => 317, '��' => 316, '��' => 315, '��' => 314, '��' => 313, '��' => 312, '��' => 311,
'��' => 310, '��' => 309, '��' => 308, '��' => 307, '��' => 306, 'У' => 305, '��' => 304, '��' => 303, '��' => 302, '��' => 301,
'�' => 300, 'ˮ' => 299, '�' => 298, '��' => 297, '�' => 296, '��' => 295, '��' => 294, '��' => 293, '��' => 292, '��' => 291,
'��' => 290, '��' => 289, '��' => 288, '��' => 287, '��' => 286, 'ԭ' => 285, '�' => 284, '��' => 283, '��' => 282, 'ͨ' => 281,
'��' => 280, '�' => 279, '�' => 278, '��' => 277, '��' => 276, '��' => 275, '��' => 274, '��' => 273, '��' => 272, '��' => 271,
'��' => 270, '��' => 269, '��' => 268, '��' => 267, 'ѡ' => 266, '��' => 265, '�' => 264, '��' => 263, '��' => 262, '��' => 261,
'�' => 260, '��' => 259, '��' => 258, '��' => 257, '�' => 256, '��' => 255, 'Ŀ' => 254, '��' => 253, '��' => 252, '�' => 251,
'�' => 250, '��' => 249, '��' => 248, '��' => 247, '��' => 246, '��' => 245, '��' => 244, '��' => 243, '��' => 242, 'ͷ' => 241,
'ϲ' => 240, '��' => 239, '��' => 238, '��' => 237, '��' => 236, '��' => 235, '��' => 234, '��' => 233, 'ͳ' => 232, '��' => 231,
'��' => 230, '��' => 229, '��' => 228, '��' => 227, '��' => 226, '��' => 225, '��' => 224, '��' => 223, '��' => 222, 'ʦ' => 221,
'��' => 220, '�' => 219, '��' => 218, '��' => 217, '��' => 216, '˭' => 215, 'ɽ' => 214, 'ÿ' => 213, '��' => 212, '��' => 211,
'��' => 210, '��' => 209, '��' => 208, '��' => 207, '��' => 206, '��' => 205, '��' => 204, '��' => 203, '��' => 202, 'ֱ' => 201,
'��' => 200, '��' => 199, '��' => 198, 'ת' => 197, '��' => 196, 'ָ' => 195, '��' => 194, '��' => 193, '�' => 192, '�' => 191,
'��' => 190, 'ϣ' => 189, '��' => 188, 'ȡ' => 187, '��' => 186, '��' => 185, '�' => 184, '��' => 183, '��' => 182, '��' => 181,
'�' => 180, '��' => 179, '��' => 178, '��' => 177, '�' => 176, '��' => 175, '��' => 174, 'ʿ' => 173, 'ս' => 172, '��' => 171,
'��' => 170, '��' => 169, 'ȴ' => 168, '��' => 167, '��' => 166, 'ҵ' => 165, '��' => 164, 'д' => 163, 'Ӱ' => 162, '�' => 161,
'ƽ' => 160, '' => 159, 'Ա' => 158, '��' => 157, '��' => 156, 'ɫ' => 155, '��' => 154, '��' => 153, '��' => 152, 'Ƭ' => 151,
'��' => 150, '��' => 149, '��' => 148, '��' => 147, '��' => 146, 'ǿ' => 145, '��' => 144, '��' => 143, '��' => 142, '��' => 141,
'��' => 140, '��' => 139, '��' => 138, '��' => 137, '˼' => 136, '��' => 135, 'Ԫ' => 134, '��' => 133, '��' => 132, '��' => 131,
'��' => 130, '��' => 129, '��' => 128, '��' => 127, 'δ' => 126, '��' => 125, '��' => 124, '��' => 123, '��' => 122, '��' => 121,
'��' => 120, 'Ц' => 119, 'Ͷ' => 118, '��' => 117, '��' => 116, '�' => 115, '��' => 114, '�' => 113, '��' => 112, '��' => 111,
'֧' => 110, '��' => 109, 'è' => 108, '��' => 107, '��' => 106, '��' => 105, '��' => 104, '��' => 103, '��' => 102, '��' => 101,
'��' => 100, '��' => 99, '��' => 98, '��' => 97, '��' => 96, '��' => 95, '��' => 94, '�' => 93, '��' => 92, '��' => 91,
'��' => 90, '��' => 89, '��' => 88, '��' => 87, '��' => 86, '��' => 85, '��' => 84, '��' => 83, 'Ǯ' => 82, 'ʧ' => 81,
'��' => 80, 'ס' => 79, '��' => 78, '��' => 77, '¼' => 76, 'ר' => 75, '��' => 74, '��' => 73, '��' => 72, '��' => 71,
'��' => 70, 'Զ' => 69, '��' => 68, '��' => 67, '��' => 66, '��' => 65, '��' => 64, '��' => 63, 'ȷ' => 62, '��' => 61,
'��' => 60, '��' => 59, 'װ' => 58, '��' => 57, '��' => 56, '�' => 55, '��' => 54, '��' => 53, '��' => 52, 'Ӣ' => 51,
'��' => 50, '��' => 49, '��' => 48, '��' => 47, '��' => 46, '��' => 45, '��' => 44, '��' => 43, '�' => 42, '�' => 41,
'��' => 40, '��' => 39, 'ͼ' => 38, '��' => 37, '��' => 36, '�' => 35, '��' => 34, 'ʶ' => 33, '��' => 32, '��' => 31,
'��' => 30, 'Ү' => 29, 'Ʒ' => 28, '��' => 27, '��' => 26, 'ʼ' => 25, '��' => 24, '��' => 23, '��' => 22, 'Ȩ' => 21,
'��' => 20, '��' => 19, '��' => 18, '��' => 17, '��' => 16, '��' => 15, '�' => 14, '��' => 13, '��' => 12, '��' => 11,
'��' => 10, '��' => 9, '��' => 8, '��' => 7, '��' => 6, '��' => 5, '��' => 4, '��' => 3, '��' => 2, 'Լ' => 1, );
%b5hash = (
'��' => 500, '�O' => 499, '��' => 498, '��' => 497, '�@' => 496, '��' => 495, '�j' => 494, '�b' => 493, '�H' => 492, '�F' => 491,
'��' => 490, '��' => 489, '��' => 488, '�n' => 487, '�H' => 486, '�i' => 485, '�o' => 484, '��' => 483, '�A' => 482, '�|' => 481,
'�n' => 480, '��' => 479, '�W' => 478, '��' => 477, '��' => 476, '�N' => 475, '��' => 474, '�]' => 473, '��' => 472, '��' => 471,
'�p' => 470, '��' => 469, '��' => 468, '��' => 467, '�S' => 466, '�L' => 465, '��' => 464, '��' => 463, '��' => 462, '��' => 461,
'��' => 460, '�U' => 459, '�L' => 458, '��' => 457, '��' => 456, '��' => 455, '��' => 454, '�h' => 453, '��' => 452, '�p' => 451,
'��' => 450, '�Q' => 449, '�o' => 448, '�u' => 447, '�X' => 446, '��' => 445, '�q' => 444, '��' => 443, '��' => 442, '��' => 441,
'��' => 440, '��' => 439, '�l' => 438, '��' => 437, '�T' => 436, '�a' => 435, '��' => 434, '��' => 433, '�u' => 432, '�h' => 431,
'��' => 430, '��' => 429, '��' => 428, '�x' => 427, '��' => 426, '�H' => 425, '�P' => 424, '��' => 423, '��' => 422, '�D' => 421,
'�o' => 420, '�a' => 419, '�k' => 418, '�L' => 417, '�M' => 416, '��' => 415, '��' => 414, '��' => 413, '��' => 412, '��' => 411,
'�~' => 410, '�{' => 409, '�e' => 408, '��' => 407, '�u' => 406, '�s' => 405, '�M' => 404, '�]' => 403, '�G' => 402, '�N' => 401,
'�w' => 400, '�I' => 399, '��' => 398, '��' => 397, '�D' => 396, '��' => 395, '��' => 394, '��' => 393, '��' => 392, '�M' => 391,
'�T' => 390, '��' => 389, '��' => 388, '�a' => 387, '��' => 386, '�@' => 385, '�z' => 384, '��' => 383, '��' => 382, '�g' => 381,
'�W' => 380, '��' => 379, '��' => 378, '��' => 377, '��' => 376, '�}' => 375, '��' => 374, '�P' => 373, '��' => 372, '�R' => 371,
'�A' => 370, '��' => 369, '�G' => 368, '��' => 367, '��' => 366, '��' => 365, '��' => 364, '��' => 363, '�S' => 362, '��' => 361,
'�O' => 360, '��' => 359, '�k' => 358, '��' => 357, '��' => 356, '��' => 355, '��' => 354, '��' => 353, '�t' => 352, '��' => 351,
'��' => 350, '��' => 349, '�w' => 348, '��' => 347, '��' => 346, '��' => 345, '��' => 344, '�_' => 343, '�v' => 342, '��' => 341,
'�D' => 340, '�' => 339, '�Q' => 338, '��' => 337, '�~' => 336, '�O' => 335, 'ı' => 334, '��' => 333, '��' => 332, '��' => 331,
'�~' => 330, '�i' => 329, '�o' => 328, '��' => 327, '��' => 326, '�U' => 325, '��' => 324, '�N' => 323, '��' => 322, '��' => 321,
'�[' => 320, '�^' => 319, '�P' => 318, '��' => 317, '��' => 316, '�y' => 315, '��' => 314, '��' => 313, '�{' => 312, '��' => 311,
'�Q' => 310, '��' => 309, '��' => 308, '��' => 307, '�s' => 306, '��' => 305, '�O' => 304, '��' => 303, '��' => 302, '��' => 301,
'ť' => 300, '��' => 299, '��' => 298, '�`' => 297, '�z' => 296, '��' => 295, '��' => 294, '�F' => 293, '��' => 292, '��' => 291,
'�W' => 290, '��' => 289, '��' => 288, '��' => 287, '��' => 286, '��' => 285, '��' => 284, '��' => 283, '��' => 282, '�q' => 281,
'��' => 280, '��' => 279, '��' => 278, '��' => 277, '��' => 276, '�_' => 275, '��' => 274, '�J' => 273, '��' => 272, '��' => 271,
'�' => 270, '�|' => 269, '��' => 268, '��' => 267, '��' => 266, '��' => 265, '��' => 264, '��' => 263, '��' => 262, '��' => 261,
'��' => 260, '�q' => 259, '��' => 258, '�w' => 257, '��' => 256, '��' => 255, '��' => 254, '��' => 253, '�X' => 252, '��' => 251,
'��' => 250, '�{' => 249, '�i' => 248, '��' => 247, '�X' => 246, '��' => 245, '��' => 244, '�' => 243, '��' => 242, '�Y' => 241,
'��' => 240, '�\\' => 239, '��' => 238, '��' => 237, '�R' => 236, '��' => 235, '��' => 234, '�Y' => 233, '��' => 232, '��' => 231,
'�O' => 230, '�N' => 229, '��' => 228, '�B' => 227, '��' => 226, '��' => 225, '�p' => 224, '��' => 223, '�r' => 222, '�v' => 221,
'��' => 220, '�F' => 219, '��' => 218, '�i' => 217, '�k' => 216, '��' => 215, '�s' => 214, '�C' => 213, '��' => 212, '�B' => 211,
'�P' => 210, '�D' => 209, '��' => 208, '��' => 207, '�s' => 206, '��' => 205, '��' => 204, '��' => 203, '��' => 202, '��' => 201,
'�]' => 200, '��' => 199, '��' => 198, '��' => 197, '��' => 196, '��' => 195, '��' => 194, '��' => 193, '��' => 192, '��' => 191,
'��' => 190, '��' => 189, '��' => 188, '��' => 187, '��' => 186, '��' => 185, '��' => 184, '��' => 183, '�L' => 182, '��' => 181,
'�@' => 180, '��' => 179, '��' => 178, '�q' => 177, '��' => 176, '�K' => 175, '��' => 174, '�h' => 173, '��' => 172, '��' => 171,
'��' => 170, '��' => 169, '�o' => 168, '�_' => 167, '��' => 166, '�~' => 165, '�\\' => 164, '�g' => 163, '�v' => 162, '�n' => 161,
'��' => 160, '�O' => 159, '��' => 158, '��' => 157, '�Q' => 156, '��' => 155, '�h' => 154, '�e' => 153, '��' => 152, '��' => 151,
'�V' => 150, '�p' => 149, '��' => 148, '�Q' => 147, '��' => 146, '��' => 145, '�j' => 144, '�w' => 143, '��' => 142, '�S' => 141,
'ij' => 140, '��' => 139, '��' => 138, '�`' => 137, '��' => 136, '��' => 135, '��' => 134, '��' => 133, '�s' => 132, '�O' => 131,
'��' => 130, '�D' => 129, '�s' => 128, '��' => 127, '��' => 126, '��' => 125, '�M' => 124, '��' => 123, '�U' => 122, '��' => 121,
'��' => 120, '��' => 119, '��' => 118, '��' => 117, '��' => 116, '��' => 115, '��' => 114, '��' => 113, '��' => 112, '��' => 111,
'�y' => 110, '��' => 109, '�W' => 108, '��' => 107, '�d' => 106, '��' => 105, '�S' => 104, '��' => 103, '�@' => 102, '�y' => 101,
'��' => 100, '�f' => 99, '��' => 98, '�u' => 97, '��' => 96, '��' => 95, '��' => 94, '��' => 93, '�R' => 92, '��' => 91,
'�[' => 90, '�p' => 89, '��' => 88, '�B' => 87, '��' => 86, '��' => 85, '�K' => 84, '��' => 83, '��' => 82, '��' => 81,
'��' => 80, '�Y' => 79, '��' => 78, '�Y' => 77, '�t' => 76, '��' => 75, '�M' => 74, '�H' => 73, '��' => 72, '��' => 71,
'�O' => 70, '��' => 69, '��' => 68, '�t' => 67, '��' => 66, '��' => 65, '��' => 64, '�q' => 63, '��' => 62, '�T' => 61,
'��' => 60, '��' => 59, '��' => 58, '��' => 57, '�' => 56, '��' => 55, '��' => 54, '�n' => 53, '��' => 52, '�e' => 51,
'�^' => 50, '�D' => 49, '�a' => 48, '�t' => 47, '�G' => 46, '�q' => 45, '�[' => 44, '��' => 43, '��' => 42, '��' => 41,
'��' => 40, '�Z' => 39, '�g' => 38, '��' => 37, '��' => 36, '�O' => 35, '��' => 34, '�_' => 33, '��' => 32, '��' => 31,
'�W' => 30, '��' => 29, '�C' => 28, '�~' => 27, '��' => 26, '��' => 25, '�l' => 24, '�B' => 23, '��' => 22, '��' => 21,
'�v' => 20, '��' => 19, '�G' => 18, '��' => 17, 'Ū' => 16, '��' => 15, '��' => 14, '��' => 13, '�A' => 12, '��' => 11,
'��' => 10, '��' => 9, '�y' => 8, '��' => 7, '�d' => 6, '��' => 5, '�x' => 4, '�}' => 3, '��' => 2, '�b' => 1, );
%utf8hash = (
'的' => 500, '是' => 499, '不' => 498, '我' => 497, '一' => 496, '有' => 495, '大' => 494, '在' => 493, '人' => 492, '了' => 491,
'中' => 490, '到' => 489, '资' => 488, '資' => 488, '要' => 487, '以' => 486, '可' => 485, '这' => 484, '這' => 484, '个' => 483, '個' => 483, '你' => 482, '会' => 481, '會' => 481,
'好' => 480, '为' => 479, '為' => 479, '上' => 478, '来' => 477, '來' => 477, '学' => 476, '學' => 476, '就' => 475, '交' => 474, '也' => 473, '用' => 472, '能' => 471,
'如' => 470, '时' => 469, '時' => 469, '文' => 468, '说' => 467, '說' => 467, '没' => 466, '沒' => 466, '他' => 465, '看' => 464, '那' => 463, '问' => 462, '問' => 462, '生' => 461,
'提' => 460, '下' => 459, '过' => 458, '過' => 458, '请' => 457, '請' => 457, '们' => 456, '們' => 456, '天' => 455, '所' => 454, '多' => 453, '麽' => 452, '麼' => 452, '小' => 451,
'之' => 450, '想' => 449, '得' => 448, '工' => 447, '出' => 446, '还' => 445, '還' => 445, '电' => 444, '電' => 444, '对' => 443, '對' => 443, '都' => 442, '机' => 441, '機' => 441,
'自' => 440, '而' => 439, '子' => 438, '後' => 437, '讯' => 436, '訊' => 436, '家' => 435, '站' => 434, '心' => 433, '只' => 432, '去' => 431,
'知' => 430, '国' => 429, '國' => 429, '很' => 428, '台' => 427, '成' => 426, '信' => 425, '同' => 424, '何' => 423, '章' => 422, '道' => 421,
'发' => 420, '發' => 420, '地' => 419, '法' => 418, '无' => 417, '無' => 417, '然' => 416, '但' => 415, '当' => 414, '當' => 414, '於' => 413, '吗' => 412, '嗎' => 412, '本' => 411,
'年' => 410, '现' => 409, '現' => 409, '前' => 408, '最' => 407, '真' => 406, '新' => 405, '和' => 404, '因' => 403, '果' => 402, '意' => 401,
'定' => 400, '点' => 399, '點' => 399, '情' => 398, '其' => 397, '题' => 396, '題' => 396, '事' => 395, '科' => 394, '方' => 393, '些' => 392, '清' => 391,
'叁' => 390, '三' => 390, '样' => 389, '樣' => 389, '此' => 388, '吧' => 387, '位' => 386, '作' => 385, '理' => 384, '行' => 383, '者' => 382, '经' => 381, '經' => 381,
'名' => 380, '什' => 379, '谢' => 378, '謝' => 378, '日' => 377, '正' => 376, '开' => 375, '開' => 375, '话' => 374, '話' => 374, '与' => 373, '與' => 373, '实' => 372, '實' => 372, '爱' => 371, '愛' => 371,
'再' => 370, '华' => 369, '華' => 369, '二' => 368, '城' => 367, '动' => 366, '動' => 366, '比' => 365, '面' => 364, '高' => 363, '又' => 362, '或' => 361,
'力' => 360, '应' => 359, '應' => 359, '女' => 358, '种' => 357, '種' => 357, '教' => 356, '车' => 355, '車' => 355, '分' => 354, '像' => 353, '系' => 352, '长' => 351, '長' => 351,
'手' => 350, '次' => 349, '已' => 348, '明' => 347, '打' => 346, '太' => 345, '路' => 344, '起' => 343, '己' => 342, '相' => 341,
'主' => 340, '关' => 339, '關' => 339, '十' => 338, '间' => 337, '間' => 337, '外' => 336, '呢' => 335, '觉' => 334, '覺' => 334, '使' => 333, '该' => 332, '該' => 332, '友' => 331,
'才' => 330, '进' => 329, '進' => 329, '她' => 328, '民' => 327, '着' => 326, '著' => 326, '各' => 325, '全' => 324, '将' => 323, '將' => 323, '少' => 322, '两' => 321, '兩' => 321,
'加' => 320, '回' => 319, '感' => 318, '式' => 317, '第' => 316, '球' => 315, '性' => 314, '老' => 313, '程' => 312, '把' => 311,
'被' => 310, '公' => 309, '论' => 308, '論' => 308, '及' => 307, '龙' => 306, '龍' => 306, '校' => 305, '别' => 304, '別' => 304, '体' => 303, '體' => 303, '重' => 302, '给' => 301, '給' => 301,
'听' => 300, '聽' => 300, '水' => 299, '做' => 298, '常' => 297, '您' => 296, '见' => 295, '見' => 295, '里' => 294, '裡' => 294, '东' => 293, '東' => 293, '风' => 292, '風' => 292, '解' => 291,
'湾' => 290, '灣' => 290, '月' => 289, '等' => 288, '啦' => 287, '部' => 286, '原' => 285, '美' => 284, '先' => 283, '音' => 282, '通' => 281,
'管' => 280, '网' => 279, '網' => 279, '区' => 278, '區' => 278, '期' => 277, '错' => 276, '錯' => 276, '否' => 275, '乐' => 274, '樂' => 274, '入' => 273, '找' => 272, '书' => 271, '書' => 271,
'让' => 270, '讓' => 270, '四' => 269, '啊' => 268, '由' => 267, '选' => 266, '選' => 266, '较' => 265, '較' => 265, '数' => 264, '數' => 264, '表' => 263, '内' => 262, '內' => 262, '场' => 261, '場' => 261,
'它' => 260, '从' => 259, '從' => 259, '快' => 258, '欢' => 257, '歡' => 257, '至' => 256, '立' => 255, '目' => 254, '社' => 253, '合' => 252, '望' => 251,
'怎' => 250, '认' => 249, '認' => 249, '告' => 248, '更' => 247, '几' => 246, '幾' => 246, '考' => 245, '度' => 244, '难' => 243, '難' => 243, '版' => 242, '头' => 241, '頭' => 241,
'喜' => 240, '许' => 239, '許' => 239, '光' => 238, '今' => 237, '买' => 236, '買' => 236, '算' => 235, '弟' => 234, '若' => 233, '统' => 232, '統' => 232, '身' => 231,
'记' => 230, '記' => 230, '代' => 229, '号' => 228, '號' => 228, '处' => 227, '處' => 227, '完' => 226, '接' => 225, '计' => 224, '計' => 224, '言' => 223, '字' => 222, '师' => 221, '師' => 221,
'并' => 220, '並' => 220, '政' => 219, '玩' => 218, '张' => 217, '張' => 217, '男' => 216, '谁' => 215, '誰' => 215, '山' => 214, '每' => 213, '结' => 212, '結' => 212, '且' => 211,
'星' => 210, '非' => 209, '建' => 208, '改' => 207, '连' => 206, '連' => 206, '放' => 205, '哈' => 204, '活' => 203, '研' => 202, '直' => 201,
'设' => 200, '設' => 200, '陈' => 199, '陳' => 199, '报' => 198, '報' => 198, '转' => 197, '轉' => 197, '党' => 196, '黨' => 196, '指' => 195, '五' => 194, '变' => 193, '變' => 193, '气' => 192, '氣' => 192, '西' => 191,
'试' => 190, '試' => 190, '希' => 189, '神' => 188, '取' => 187, '化' => 186, '物' => 185, '王' => 184, '任' => 183, '林' => 182, '单' => 181, '單' => 181,
'世' => 180, '受' => 179, '近' => 178, '义' => 177, '義' => 177, '死' => 176, '便' => 175, '反' => 174, '士' => 173, '战' => 172, '戰' => 172, '空' => 171,
'队' => 170, '隊' => 170, '跟' => 169, '却' => 168, '卻' => 168, '北' => 167, '必' => 166, '业' => 165, '業' => 165, '功' => 164, '写' => 163, '寫' => 163, '影' => 162, '声' => 161, '聲' => 161,
'平' => 160, '' => 159, '臺' => 159, '员' => 158, '員' => 158, '金' => 157, '讨' => 156, '討' => 156, '色' => 155, '则' => 154, '則' => 154, '容' => 153, '档' => 152, '檔' => 152, '片' => 151,
'向' => 150, '市' => 149, '妳' => 149, '利' => 148, '市' => 148, '兴' => 147, '利' => 147, '白' => 146, '興' => 146, '强' => 145, '白' => 145, '安' => 144, '強' => 144, '央' => 143, '安' => 143, '特' => 142, '央' => 142, '议' => 141, '特' => 141,
'办' => 140, '議' => 140, '价' => 139, '辦' => 139, '总' => 138, '價' => 138, '传' => 137, '總' => 137, '思' => 136, '傳' => 136, '花' => 135, '思' => 135, '元' => 134, '花' => 134, '叫' => 133, '元' => 133, '保' => 132, '叫' => 132, '份' => 131, '保' => 131,
'求' => 130, '份' => 130, '究' => 129, '求' => 129, '呵' => 128, '究' => 128, '件' => 127, '呵' => 127, '未' => 126, '件' => 126, '决' => 125, '未' => 125, '组' => 124, '決' => 124, '万' => 123, '組' => 123, '竹' => 122, '萬' => 122, '级' => 121, '竹' => 121,
'持' => 120, '級' => 120, '笑' => 119, '持' => 119, '投' => 118, '笑' => 118, '哪' => 117, '投' => 117, '室' => 116, '哪' => 116, '曾' => 115, '室' => 115, '走' => 114, '曾' => 114, '喔' => 113, '走' => 113, '标' => 112, '喔' => 112, '流' => 111, '標' => 111,
'支' => 110, '流' => 110, '独' => 109, '支' => 109, '猫' => 108, '獨' => 108, '卡' => 107, '貓' => 107, '需' => 106, '卡' => 106, '兄' => 105, '需' => 105, '门' => 104, '兄' => 104, '共' => 103, '門' => 103, '语' => 102, '共' => 102, '海' => 101, '語' => 101,
'口' => 100, '海' => 100, '阿' => 99, '口' => 99, '线' => 98, '阿' => 98, '马' => 97, '線' => 97, '黄' => 96, '馬' => 96, '参' => 95, '黃' => 95, '般' => 94, '參' => 94, '命' => 93, '般' => 93, '视' => 92, '命' => 92, '观' => 91, '視' => 91,
'联' => 90, '觀' => 90, '脑' => 89, '聯' => 89, '朋' => 88, '腦' => 88, '格' => 87, '朋' => 87, '儿' => 86, '格' => 86, '八' => 85, '兒' => 85, '修' => 84, '八' => 84, '料' => 83, '修' => 83, '钱' => 82, '料' => 82, '失' => 81, '錢' => 81,
'吃' => 80, '失' => 80, '住' => 79, '吃' => 79, '即' => 78, '住' => 78, '另' => 77, '即' => 77, '录' => 76, '另' => 76, '专' => 75, '錄' => 75, '象' => 74, '專' => 74, '换' => 73, '象' => 73, '基' => 72, '換' => 72, '板' => 71, '基' => 71,
'拿' => 70, '板' => 70, '远' => 69, '拿' => 69, '速' => 68, '遠' => 68, '形' => 67, '速' => 67, '孩' => 66, '形' => 66, '备' => 65, '孩' => 65, '歌' => 64, '備' => 64, '帮' => 63, '歌' => 63, '确' => 62, '幫' => 62, '候' => 61, '確' => 61,
'除' => 60, '候' => 60, '界' => 59, '除' => 59, '装' => 58, '界' => 58, '类' => 57, '裝' => 57, '讲' => 56, '類' => 56, '器' => 55, '講' => 55, '南' => 54, '器' => 54, '案' => 53, '南' => 53, '画' => 52, '案' => 52, '英' => 51, '畫' => 51,
'诉' => 50, '英' => 50, '带' => 49, '訴' => 49, '差' => 48, '帶' => 48, '乎' => 47, '差' => 47, '量' => 46, '乎' => 46, '久' => 45, '量' => 45, '掉' => 44, '久' => 44, '似' => 43, '掉' => 43, '整' => 42, '似' => 42, '引' => 41, '整' => 41,
'班' => 40, '引' => 40, '迷' => 39, '班' => 39, '图' => 38, '迷' => 38, '制' => 37, '圖' => 37, '费' => 36, '制' => 36, '赛' => 35, '費' => 35, '奇' => 34, '賽' => 34, '识' => 33, '奇' => 33, '型' => 32, '識' => 32, '超' => 31, '型' => 31,
'边' => 30, '超' => 30, '耶' => 29, '邊' => 29, '品' => 28, '耶' => 28, '舍' => 27, '品' => 27, '虽' => 26, '舍' => 26, '始' => 25, '雖' => 25, '运' => 24, '始' => 24, '李' => 23, '運' => 23, '务' => 22, '李' => 22, '权' => 21, '務' => 21,
'验' => 20, '權' => 20, '故' => 19, '驗' => 19, '六' => 18, '故' => 18, '读' => 17, '六' => 17, '怪' => 16, '讀' => 16, '飞' => 15, '怪' => 15, '满' => 14, '飛' => 14, '服' => 13, '滿' => 13, '梦' => 12, '服' => 12, '收' => 11, '夢' => 11,
'眼' => 10, '收' => 10, '造' => 9, '眼' => 9, '念' => 8, '造' => 8, '留' => 7, '念' => 7, '课' => 6, '留' => 6, '军' => 5, '課' => 5, '破' => 4, '軍' => 4, '精' => 3, '破' => 3, '半' => 2, '精' => 2, '约' => 1, '半' => 1, );
# Names of the encodings that codeguess() ranks.
@codes = ('gb', 'hz', 'big5', 'utf8', 'ascii', 'other'); # iso8859, SJIS

# prob_sort: sort comparator that orders encoding names by ascending
# score, reading the scores from the package hash %codelist.
sub prob_sort {
    return $codelist{$a} <=> $codelist{$b};
}
# codeguess: Score a text string against every supported encoding and
# rank the encoding names by that score.
#
#   $srctxt - the raw bytes of the document to examine
#
# Returns the package array @codes, re-sorted so that the most likely
# encoding name is at index 0.
# Side effects: stores each encoding's score in the package hash
# %codelist and re-orders @codes in place.
sub codeguess {
    my ($srctxt) = @_;

    $codelist{'gb'}    = gb_probability($srctxt);
    $codelist{'hz'}    = hz_probability($srctxt);
    $codelist{'big5'}  = big5_probability($srctxt);
    $codelist{'utf8'}  = utf8_probability($srctxt);
    $codelist{'ascii'} = ascii_probability($srctxt);
    # $codelist{'iso8859'} = iso8859_probability($srctxt);

    # "other" wins only when no detector is confident (Big5 uses a
    # cut-off of 55, the rest 50).
    my $no_confident_guess =
           $codelist{'gb'}    < 50
        && $codelist{'hz'}    < 50
        && $codelist{'big5'}  < 55
        && $codelist{'utf8'}  < 50
        && $codelist{'ascii'} < 50;

    # The score must be reset on every call: %codelist is a package
    # global, so a 90 left over from an earlier call would otherwise make
    # "other" win for every later document as well.
    $codelist{'other'} = $no_confident_guess ? 90 : 0;

    # Ascending sort, then reversed, so the highest score comes first.
    @codes = reverse sort { $codelist{$a} <=> $codelist{$b} } @codes;
    return @codes;
}
# hex2utf8: Take a string of hex digits (0-9A-F), optionally prefixed
# with "0x" or "0X", and convert the code point it names to its UTF-8
# byte sequence.
#
#   $hexchar - code point in hexadecimal, e.g. "4E2D" or "0x4E2D"
#
# Returns a byte string of 1 to 4 bytes: 1 byte up to 0x7F, 2 bytes up
# to 0x7FF, 3 bytes up to 0xFFFF and 4 bytes above that.
sub hex2utf8 {
    my ($hexchar) = @_;

    # Strip the prefix case-insensitively; a "0X" prefix used to be
    # double-prefixed and decoded as code point 0.
    (my $digits = $hexchar) =~ s/^0x//i;
    my $codepoint = hex($digits);

    if ($codepoint <= 0x7F) {
        return pack("C", $codepoint);
    }
    if ($codepoint <= 0x7FF) {
        return pack("C2",
            0xC0 | ($codepoint >> 6),
            0x80 | ($codepoint & 0x3F));
    }
    if ($codepoint <= 0xFFFF) {
        return pack("C3",
            0xE0 | ($codepoint >> 12),
            0x80 | (($codepoint >> 6) & 0x3F),
            0x80 | ($codepoint & 0x3F));
    }
    # Beyond the BMP a fourth byte is required; squeezing these into the
    # 3-byte form would produce an invalid lead byte.
    return pack("C4",
        0xF0 | (($codepoint >> 18) & 0x07),
        0x80 | (($codepoint >> 12) & 0x3F),
        0x80 | (($codepoint >> 6) & 0x3F),
        0x80 | ($codepoint & 0x3F));
}
# utf82ucs: Convert a UTF-8 byte string into a string of big-endian
# two-byte code units, one unit per decoded character.
#
#   $utfstring - UTF-8 encoded bytes
#
# A lead byte up to 0x7F is a 1-byte character, a lead byte of the form
# 110xxxxx starts a 2-byte character, and every other lead byte is
# treated as the start of a 3-byte character.
# Returns the converted string (undef when the input is empty).
sub utf82ucs {
    my ($utfstring) = @_;
    my $ucs;    # deliberately left undefined until something is appended
    my $total = length($utfstring);
    my $pos   = 0;

    while ($pos < $total) {
        my $lead_char = substr($utfstring, $pos, 1);
        my $lead      = unpack("C", $lead_char);

        if ($lead <= 0x7F) {
            # ASCII: high byte is zero, low byte is the character itself.
            $ucs .= "\x00" . $lead_char;
            $pos += 1;
            next;
        }

        if (($lead & 0xE0) == 0xC0) {
            my $trail = unpack("C", substr($utfstring, $pos + 1, 1));
            my $value = (($lead & 0x1F) << 6) | ($trail & 0x3F);
            $ucs .= pack("n", $value);
            $pos += 2;
            next;
        }

        my $middle = unpack("C", substr($utfstring, $pos + 1, 1));
        my $final  = unpack("C", substr($utfstring, $pos + 2, 1));
        my $value  = (($lead & 0x0F) << 12)
                   | (($middle & 0x3F) << 6)
                   | ($final & 0x3F);
        $ucs .= pack("n", $value);
        $pos += 3;
    }

    return $ucs;
}
# utf82hex: Convert a UTF-8 byte string into a string of lower-case hex
# digits, four digits per decoded character.
#
#   $utfstring - UTF-8 encoded bytes
#
# A lead byte up to 0x7F is a 1-byte character, a lead byte of the form
# 110xxxxx starts a 2-byte character, and every other lead byte is
# treated as the start of a 3-byte character (4-byte sequences are not
# recognised).
# Returns the hex string; an empty input gives an empty string.
sub utf82hex {
    my ($utfstring) = @_;
    my $hexstring = "";
    my $total = length($utfstring);
    my $pos   = 0;

    while ($pos < $total) {
        my $lead = unpack("C", substr($utfstring, $pos, 1));
        my $value;

        if ($lead <= 0x7F) {    # 1 byte long (ASCII)
            # Format the byte VALUE. Formatting the character itself with
            # %x numifies it to 0, which turned every non-digit ASCII
            # character into "0000".
            $value = $lead;
            $pos  += 1;
        }
        elsif (($lead & 0xE0) == 0xC0) {    # 2 bytes long
            my $trail = unpack("C", substr($utfstring, $pos + 1, 1));
            $value = (($lead & 0x1F) << 6) | ($trail & 0x3F);
            $pos  += 2;
        }
        else {    # 3 bytes long
            my $middle = unpack("C", substr($utfstring, $pos + 1, 1));
            my $final  = unpack("C", substr($utfstring, $pos + 2, 1));
            $value = (($lead & 0x0F) << 12)
                   | (($middle & 0x3F) << 6)
                   | ($final & 0x3F);
            $pos  += 3;
        }

        $hexstring .= sprintf("%04x", $value);
    }

    return $hexstring;
}
# hex2gb: Take a string of four hex digits (0-9A-F), optionally prefixed
# with "0x", and convert it to the corresponding two-byte EUC GB
# character.
#
#   $hexchar - four hex digits, e.g. "3021" or "0xB0A1"
#
# Each of the two byte values that is 0x7F or lower gets 0x80 added, so
# both 7-bit and 8-bit forms of a code give the same result.
# Returns the two-byte string.
sub hex2gb {
    my ($hexchar) = @_;

    # All temporaries are lexical. "my $a, $b" without parentheses only
    # declares the first name and silently makes the rest package globals.
    $hexchar =~ s/^0x//i;
    my $high = hex(substr($hexchar, 0, 2));
    my $low  = hex(substr($hexchar, 2, 2));

    $high += 0x80 if $high <= 0x7F;
    $low  += 0x80 if $low <= 0x7F;

    return pack("C2", $high, $low);
}
# hex2big5: Take a string of four hex digits (0-9A-F), optionally
# prefixed with "0x", and convert it to the corresponding two-byte Big5
# character.
#
#   $hexchar - four hex digits, e.g. "A440" or "0xAABA"
#
# Returns the two-byte string; the byte values are used exactly as given.
sub hex2big5 {
    my ($hexchar) = @_;

    # All temporaries are lexical. "my $a, $b" without parentheses only
    # declares the first name and silently makes the rest package globals.
    $hexchar =~ s/^0x//i;
    my $high = hex(substr($hexchar, 0, 2));
    my $low  = hex(substr($hexchar, 2, 2));

    return pack("C2", $high, $low);
}
# bytes2hex: Render the first two bytes of a string as "0x" followed by
# four upper-case hex digits.
#
#   $twobytes - the string whose first two bytes are formatted
#
# Returns e.g. "0xB0A1" for the bytes 0xB0 0xA1.
sub bytes2hex {
    my ($twobytes) = @_;

    # Lexical temporaries; the unparenthesised "my $hex1, $hex2, $allhex"
    # used to leave the last two as package globals.
    my $hex1 = unpack("H2", substr($twobytes, 0, 1));
    my $hex2 = unpack("H2", substr($twobytes, 1, 1));

    return "0x" . uc($hex1 . $hex2);
}
# hz2gb: Take a given string and convert any Hz sequences in it to the
# corresponding GB sequence.
#
#   $hzline - text that may contain "~{ ... ~}" escapes
#
# Inside an escape every pair of bytes becomes one GB character by
# setting the high bit of each byte. An escape ends at "~}", or at a
# newline / carriage return (which is copied to the output). Outside an
# escape "~~" becomes a single "~", and a "~" followed by anything else
# is copied unchanged.
# Returns the converted string.
sub hz2gb {
    my ($hzline) = @_;
    my $gbline = "";
    my $hzlen  = length($hzline);
    my $i      = 0;

    while ($i < $hzlen) {
        my $char = substr($hzline, $i, 1);

        if ($char ne "~") {
            $gbline .= $char;
            $i++;
            next;
        }

        my $next = substr($hzline, $i + 1, 1);

        if ($next eq "~") {    # ~~ becomes ~
            # Consume BOTH tildes. Skipping only one let the second tilde
            # be emitted again, or start a bogus escape when followed by "{".
            $gbline .= "~";
            $i += 2;
            next;
        }

        if ($next ne "{") {    # false alarm
            $gbline .= "~";
            $i++;
            next;
        }

        # "~{" found: decode byte pairs until the escape ends.
        $i += 2;
        while ($i < $hzlen) {
            if (substr($hzline, $i, 2) eq "~}") {
                $i += 2;
                last;
            }
            my $here = substr($hzline, $i, 1);
            if ($here eq "\n" or $here eq "\r") {
                $gbline .= $here;
                $i++;
                last;
            }
            my $hzval1 = vec($hzline, $i, 8);
            my $hzval2 = vec($hzline, $i + 1, 8);
            # Only set the high bit when it is missing: some documents put
            # raw GB bytes inside the escape, and adding 0x80 to those
            # would overflow a byte.
            $hzval1 += 0x80 if $hzval1 < 0x80;
            $hzval2 += 0x80 if $hzval2 < 0x80;
            $gbline .= pack("C2", $hzval1, $hzval2);
            $i += 2;
        }
    }

    return $gbline;
}
# gb2hz: Take a string containing GB characters and convert it to the
# corresponding Hz encoded string. Adjacent GB characters are all
# included in one Hz escape sequence (a single "~{ ... ~}" pair).
#
#   $gbline - text containing 8-bit GB byte pairs mixed with ASCII
#
# Every "~" outside a GB run is written as "~~".
# Returns the Hz encoded string.
sub gb2hz {
    my ($gbline) = @_;
    my $hzline = "";
    my $gblen  = length($gbline);
    my $i      = 0;

    while ($i < $gblen) {
        if (vec($gbline, $i, 8) < 0x80) {
            my $char = substr($gbline, $i, 1);
            $hzline .= ($char eq "~") ? "~~" : $char;    # ~ must be escaped
            $i++;
            next;
        }

        # A run of high-bit bytes: strip the high bit of each byte pair.
        $hzline .= "~{";
        while ($i < $gblen and vec($gbline, $i, 8) >= 0x80) {
            my $gbval1 = vec($gbline, $i, 8) & 0x7F;
            # Masking (rather than subtracting 0x80) keeps the value in
            # byte range when the second byte is missing or already 7-bit.
            my $gbval2 = vec($gbline, $i + 1, 8) & 0x7F;
            $hzline .= pack("C2", $gbval1, $gbval2);
            $i += 2;
        }
        # Always close the escape, including when the run reaches the end
        # of the string. The ASCII byte that ended the run (if any) is left
        # for the outer loop so that a "~" there is still escaped.
        $hzline .= "~}";
    }

    return $hzline;
}
# gb_probability: Score how likely a byte string is to be GB encoded.
#
#   $srctxt - the raw bytes to examine
#
# The score is the sum of two parts, each worth up to 50:
#   * range part: the share of high-bit byte pairs whose lead byte is
#     160-247 and whose second byte is 160-254;
#   * frequency part: for pairs with a lead byte of 176-247, the rank
#     found in the package hash %gbhash (0 when absent) relative to the
#     maximum of 500 per pair.
# Returns the combined score.
sub gb_probability {
    my ($srctxt) = @_;

    # Everything is lexical: the loop index and counters used to be
    # package globals ($i, $strlen, ...) and clobbered the caller's
    # variables of the same name.
    my $strlen    = length($srctxt);
    my $dbchars   = 1;    # starts at 1 so the ratio never divides by zero
    my $gbchars   = 0;
    my $gbfreq    = 0;
    my $totalfreq = 1;    # starts at 1 so the ratio never divides by zero

    for (my $i = 0; $i < $strlen; $i++) {
        my $lead = vec($srctxt, $i, 8);
        next if $lead <= 127;

        my $second = vec($srctxt, $i + 1, 8);
        $dbchars++;

        # Range check: does the pair fall into the accepted byte ranges?
        if ($lead >= 160 and $lead <= 247 and
            $second >= 160 and $second <= 254) {
            $gbchars++;
        }

        # Frequency check: how common is this character?
        if ($lead >= 176 and $lead <= 247) {
            $totalfreq += 500;
            $gbfreq += $gbhash{ substr($srctxt, $i, 2) } || 0;
        }

        $i++;    # the second byte of the pair has been consumed
    }

    my $rangeval = 50 * ($gbchars / $dbchars);
    my $freqval  = 50 * ($gbfreq / $totalfreq);
    return $rangeval + $freqval;
}
# hz_probability: Score how likely a byte string is to be Hz encoded.
#
#   $srctxt - the raw bytes to examine
#
# The score is the sum of two parts:
#   * escape part: 50 when more than four "~{" sequences are present,
#     41 for two to four, 39 for exactly one, 0 for none;
#   * frequency part: the byte pairs inside each escape are looked up in
#     the package hash %gbhash (after setting their high bits); when the
#     resulting ratio scores above 10 the part is fixed at 41.
# Returns the combined score.
sub hz_probability {
    my ($srctxt) = @_;

    # Everything is lexical: the loop index and counters used to be
    # package globals and clobbered the caller's variables.
    my $strlen = length($srctxt);

    # Stage 1: Check to see if any Hz escape sequences, ~{, are found
    my $hzcharstart = () = $srctxt =~ /~\{/g;

    my $rangeval;
    if ($hzcharstart > 4) {
        $rangeval = 50;
    } elsif ($hzcharstart > 1) {
        $rangeval = 41;
    } elsif ($hzcharstart > 0) {    # Only 39 in case the sequence happened
        $rangeval = 39;             # to occur in otherwise non-Hz text
    } else {
        $rangeval = 0;
    }

    # Stage 2 : Check to see if frequency count accords with expected amount
    my $hzfreq    = 0;
    my $totalfreq = 1;    # starts at 1 so the ratio never divides by zero
    for (my $i = 0; $i < $strlen; $i++) {
        next unless substr($srctxt, $i, 2) eq "~{";
        $i += 2;

        # Stop at the end of the string as well as at "~}" or a line end.
        # Without this bound an unterminated escape kept reading zero
        # bytes past the end of the string and never returned.
        while ($i + 1 < $strlen) {
            my $hz1 = vec($srctxt, $i, 8);
            my $hz2 = vec($srctxt, $i + 1, 8);

            last if $hz1 == 0x0A or $hz1 == 0x0D;      # End of line
            last if $hz1 == 0x7E and $hz2 == 0x7D;     # "~}" found

            # Some docs have GB chars in Hz escapes; only set the high
            # bit when it is not already there.
            $hz1 += 0x80 if $hz1 < 0x80;
            $hz2 += 0x80 if $hz2 < 0x80;

            if ($hz1 >= 176 and $hz1 <= 247) {
                $totalfreq += 500;
                $hzfreq += $gbhash{ pack("C2", $hz1, $hz2) } || 0;
            }
            $i += 2;
        }
    }

    my $freqval = 50 * ($hzfreq / $totalfreq);
    if ($freqval > 10) {
        $freqval = 41;
    }

    return $rangeval + $freqval;
}
# big5_probability: Score how likely a byte string is to be Big5 encoded.
#
#   $srctxt - the raw bytes to examine
#
# The score is the sum of two parts, each worth up to 50:
#   * range part: the share of high-bit byte pairs whose lead byte is
#     0xA1-0xF9 and whose second byte is 0x40-0xFE;
#   * frequency part: for pairs with a lead byte of 164-249, the rank
#     found in the package hash %b5hash (0 when absent) relative to the
#     maximum of 500 per pair.
# Returns the combined score.
sub big5_probability {
    my ($srctxt) = @_;

    # Everything is lexical: the loop index and counters used to be
    # package globals ($i, $strlen, ...) and clobbered the caller's
    # variables of the same name.
    my $strlen    = length($srctxt);
    my $dbchars   = 1;    # starts at 1 so the ratio never divides by zero
    my $b5chars   = 0;
    my $b5freq    = 0;
    my $totalfreq = 1;    # starts at 1 so the ratio never divides by zero

    for (my $i = 0; $i < $strlen; $i++) {
        my $lead = vec($srctxt, $i, 8);
        next if $lead <= 127;

        my $second = vec($srctxt, $i + 1, 8);
        $dbchars++;

        # Range check: does the pair fall into the accepted byte ranges?
        if ($lead >= 0xA1 and $lead <= 0xF9 and
            $second >= 0x40 and $second <= 0xFE) {
            $b5chars++;
        }

        # Frequency check: how common is this character?
        if ($lead >= 164 and $lead <= 249) {
            $totalfreq += 500;
            $b5freq += $b5hash{ substr($srctxt, $i, 2) } || 0;
        }

        $i++;    # the second byte of the pair has been consumed
    }

    my $rangeval = 50 * ($b5chars / $dbchars);
    my $freqval  = 50 * ($b5freq / $totalfreq);
    return $rangeval + $freqval;
}
# utf8_probability: Score how likely a byte string is to be UTF-8.
#
#   $srctxt - the raw bytes to examine
#
# A well-formed sequence is a lead byte 0xC0-0xDF followed by one byte
# in 0x80-0xBF, or a lead byte 0xE0-0xEF followed by two such bytes.
# The score is the sum of two parts, each worth up to 50:
#   * range part: well-formed sequences as a share of the high-bit
#     bytes visited (the trailing bytes of a well-formed sequence are
#     skipped, not counted);
#   * frequency part: the rank of each well-formed sequence in the
#     package hash %utf8hash (0 when absent) relative to a maximum of
#     500 per lead byte seen.
# Returns the combined score.
sub utf8_probability {
    my ($srctxt) = @_;

    # Everything is lexical: the loop index and counters used to be
    # package globals and clobbered the caller's variables.
    my $strlen    = length($srctxt);
    my $dbchars   = 1;    # starts at 1 so the ratio never divides by zero
    my $utf8chars = 0;
    my $utf8freq  = 0;
    my $totalfreq = 1;    # starts at 1 so the ratio never divides by zero

    for (my $i = 0; $i < $strlen; $i++) {
        my $lead = vec($srctxt, $i, 8);
        next if $lead <= 127;
        $dbchars++;

        my $second = vec($srctxt, $i + 1, 8);
        my $third  = vec($srctxt, $i + 2, 8);
        my $second_ok = ($second >= 0x80 and $second <= 0xBF);
        my $third_ok  = ($third >= 0x80 and $third <= 0xBF);

        my $seqlen = 0;    # length of the well-formed sequence, if any
        if ($lead >= 0xC0 and $lead <= 0xDF) {
            $totalfreq += 500;
            $seqlen = 2 if $second_ok;
        } elsif ($lead >= 0xE0 and $lead <= 0xEF) {
            $totalfreq += 500;
            # BOTH trailing bytes must be continuation bytes. The range
            # check used to test the second byte twice, so a truncated
            # sequence was counted and the following byte was swallowed.
            $seqlen = 3 if $second_ok and $third_ok;
        }
        next unless $seqlen;

        $utf8chars++;
        $utf8freq += $utf8hash{ substr($srctxt, $i, $seqlen) } || 0;
        $i += $seqlen - 1;    # skip the trailing bytes just consumed
    }

    my $rangeval = 50 * ($utf8chars / $dbchars);
    my $freqval  = 50 * ($utf8freq / $totalfreq);
    return $rangeval + $freqval;
}
# ascii_probability: Pretty simple: if the text has any bytes with the
# high bit set, it's not pure ascii.
#
#   $srctxt - the raw bytes to examine
#
# Returns 90 when every byte is below 128 (including for an empty
# string) and 0 otherwise.
sub ascii_probability {
    my ($srctxt) = @_;

    # A single pattern match replaces the byte-by-byte loop, whose index
    # and counters were package globals that clobbered the caller's
    # variables of the same name.
    return ($srctxt =~ /[^\x00-\x7F]/) ? 0 : 90;
}