Skip to content

Instantly share code, notes, and snippets.

@vireshas
Last active August 29, 2015 13:57
Show Gist options
  • Save vireshas/9499216 to your computer and use it in GitHub Desktop.
Save vireshas/9499216 to your computer and use it in GitHub Desktop.
Count the number of non-ASCII (multi-byte UTF-8) characters in each line
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//utf-8 chars can be 4bytes long
#define U_MAX_BYTES 4
//counter to hold non ascii chars count
int count = 0;

/*
 * Read one UTF-8 encoded character from `stream` into `bytes`.
 *
 * The first byte announces the length of the sequence through its high bits:
 *   0xxxxxxx -> 1 byte (ASCII)       110xxxxx -> 2 bytes
 *   1110xxxx -> 3 bytes              1111xxxx -> 4 bytes
 * A stray continuation byte (10xxxxxx) in first position is returned on its
 * own as a single byte and is not counted.
 *
 * stream: input stream to read from.
 * bytes:  caller buffer of at least U_MAX_BYTES + 1 chars; it is zero-filled
 *         first, so the stored sequence is always NUL-terminated.
 *
 * Returns the number of bytes stored in `bytes`, or 0 at end of input.
 * Side effect: the global `count` is incremented whenever the first byte is
 * a multi-byte lead byte.
 *
 * A sequence that is cut short (end of input, or a following byte that is not
 * a 10xxxxxx continuation byte) still counts as one non-ASCII character, but
 * only the bytes actually belonging to it are consumed: the offending byte is
 * pushed back so that, e.g., a newline is never swallowed.
 */
int u_getc(FILE *stream, char *bytes) {
    /* masks for the 11......, 111....., 1111.... lead-byte prefixes */
    static const int mask[] = {192, 224, 240};
    unsigned short extra, k;
    int first, next;

    memset(bytes, 0, U_MAX_BYTES + 1);

    /* Keep the result in an int: once narrowed to char, EOF is either
       indistinguishable from byte 0xFF or can never compare equal. */
    first = getc(stream);
    if (first == EOF) {
        return 0;
    }
    bytes[0] = (char)first;

    //check the length of utf8 char
    extra = 0;
    if ((first & mask[0]) == mask[0]) extra++;
    if ((first & mask[1]) == mask[1]) extra++;
    if ((first & mask[2]) == mask[2]) extra++;

    //a lead byte with trailing bytes means a non ascii char
    if (extra > 0) count++;

    for (k = 1; k <= extra; k++) {
        next = getc(stream);
        if (next == EOF) {
            return k;               /* input ended inside the sequence */
        }
        if ((next & 0xC0) != 0x80) {
            ungetc(next, stream);   /* not a continuation byte: leave it */
            return k;
        }
        bytes[k] = (char)next;
    }
    return extra + 1;
}
/*
 * Read text from stdin and print, for every line, the number of non-ASCII
 * characters it contains (one decimal number per output line).
 *
 * u_getc() updates the global `count`; it is printed and reset each time a
 * newline is read. A final line that is not newline-terminated is reported
 * as well, even when its count is 0, so every input line yields exactly one
 * output line.
 *
 * Command-line arguments are not used. Always returns 0.
 */
int main(int argc, char *argv[]) {
    /* generously larger than the few bytes u_getc stores per character */
    char bytes[200];
    /* set while characters have been read since the last newline */
    int line_open = 0;

    (void)argc;
    (void)argv;

    while (u_getc(stdin, bytes)) {
        if (strcmp(bytes, "\n") == 0) {
            //end of line: report and reset the non ascii count
            printf("%d\n", count);
            count = 0;
            line_open = 0;
        } else {
            line_open = 1;
        }
    }

    //input ended without a trailing newline: report the last line too
    if (line_open) {
        printf("%d\n", count);
    }
    return 0;
}
x√ab c
3 ∏x
abc cd
I ♥ haskell, ♫ & ♘
ɖ & Ɛ
€€€
1
1
0
3
2
3
@vireshas
Copy link
Author

gcc detect_utf8.c -Wall -lm
./a.out < input > output

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment