Last active
August 29, 2015 13:57
-
-
Save vireshas/9499216 to your computer and use it in GitHub Desktop.
Detect the number of UTF-8 (non-ASCII) chars in each line
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
/* A UTF-8 encoded character occupies at most 4 bytes. */
#define U_MAX_BYTES 4

/*
 * Running count of non-ASCII characters read by u_getc().
 * u_getc() only ever increments it; the caller resets it (per line).
 */
int count = 0;

/*
 * Read one UTF-8 character from `stream` into `bytes`.
 *
 * `bytes` must have room for at least U_MAX_BYTES + 1 chars; it is zeroed
 * first, so the result is always a NUL-terminated string.
 *
 * Returns the number of bytes stored for this character, or 0 when the
 * stream is already at end-of-file (or in error). For every character
 * whose lead byte is not ASCII, the global `count` is incremented.
 *
 * Malformed input is handled without losing data: a truncated sequence
 * returns the bytes actually read, and a byte that is not a continuation
 * byte is pushed back so the next call sees it as a fresh character.
 */
int u_getc(FILE *stream, char *bytes) {
    /*
     * The lead byte announces the sequence length through its high bits:
     *   0xxxxxxx -> 1 byte (ASCII)     110xxxxx -> 2 bytes
     *   1110xxxx -> 3 bytes            11110xxx -> 4 bytes
     * (10xxxxxx marks a continuation byte, never a lead byte.)
     * Each mask below that matches the lead byte adds one trailing byte.
     */
    static const unsigned char mask[] = {0xC0, 0xE0, 0xF0};
    size_t k;
    int c;
    int extra = 0; /* continuation bytes announced by the lead byte */
    int got = 1;   /* bytes stored so far, lead byte included */

    memset(bytes, 0, U_MAX_BYTES + 1);

    /*
     * Keep the result of getc() in an int: squeezing it into a char first
     * makes a 0xFF data byte indistinguishable from EOF (signed char) or
     * makes EOF undetectable (unsigned char).
     */
    c = getc(stream);
    if (c == EOF) {
        return 0;
    }
    bytes[0] = (char)c;

    for (k = 0; k < sizeof mask / sizeof mask[0]; k++) {
        if ((c & mask[k]) == mask[k]) {
            extra++;
        }
    }

    /* ASCII characters announce no trailing bytes; anything else counts. */
    if (extra > 0) {
        count++;
    }

    while (got <= extra) {
        c = getc(stream);
        if (c == EOF) {
            break; /* truncated sequence: report what we have */
        }
        if ((c & 0xC0) != 0x80) {
            /* Not a continuation byte (e.g. '\n'): leave it for the next call. */
            ungetc(c, stream);
            break;
        }
        bytes[got++] = (char)c;
    }

    return got;
}
int main(int argc, char *argv[]) { | |
char bytes[200]; | |
while (u_getc(stdin, bytes)) { | |
//print non ascii count if we encounter a \n or EOF | |
if((strcmp(bytes,"\n") == 0)){ | |
printf("%d\n", count); | |
count = 0; | |
} | |
} | |
if(count > 0) printf("%d", count); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
x√ab c | |
3 ∏x | |
abc cd | |
I ♥ haskell, ♫ & ♘ | |
ɖ & Ɛ | |
€€€ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1 | |
1 | |
0 | |
3 | |
2 | |
3 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
gcc detect_utf8.c -Wall -lm
./a.out < input > output