Taken from libzip's _zip_guess_encoding:
zip_encoding_type_t
117 _zip_guess_encoding(zip_string_t *str, zip_encoding_type_t expected_encoding)
118 {
119 zip_encoding_type_t enc;
120 const zip_uint8_t *name;
121 zip_uint32_t i, j, ulen;
122
123 if (str == NULL)
124 return ZIP_ENCODING_ASCII;
125
126 name = str->raw;
127
128 if (str->encoding != ZIP_ENCODING_UNKNOWN)
129 enc = str->encoding;
130 else {
131 enc = ZIP_ENCODING_ASCII;
132 for (i=0; i<str->length; i++) {
133 if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t')
134 continue;
135
136 enc = ZIP_ENCODING_UTF8_GUESSED;
137 if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH)
138 ulen = 1;
139 else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH)
140 ulen = 2;
141 else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH)
142 ulen = 3;
143 else {
144 enc = ZIP_ENCODING_CP437;
145 break;
146 }
147
148 if (i + ulen >= str->length) {
149 enc = ZIP_ENCODING_CP437;
150 break;
151 }
152
153 for (j=1; j<=ulen; j++) {
154 if ((name[i+j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
155 enc = ZIP_ENCODING_CP437;
156 goto done;
157 }
158 }
159 i += ulen;
160 }
161 }
162
163 done:
164 str->encoding = enc;
165
166 if (expected_encoding != ZIP_ENCODING_UNKNOWN) {
167 if (expected_encoding == ZIP_ENCODING_UTF8_KNOWN && enc == ZIP_ENCODING_UTF8_GUESSED)
168 str->encoding = enc = ZIP_ENCODING_UTF8_KNOWN;
169
170 if (expected_encoding != enc && enc != ZIP_ENCODING_ASCII)
171 return ZIP_ENCODING_ERROR;
172 }
173
174 return enc;
175 }
176