Last active
December 9, 2022 03:50
-
-
Save oleganza/997155 to your computer and use it in GitHub Desktop.
Creating NSString from NSData by fixing invalid UTF8 characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Author: Oleg Andreev <[email protected]> | |
// May 28, 2011 | |
// Do What The Fuck You Want Public License <http://www.wtfpl.net> | |
#import "NSData+OADataHelpers.h" | |
#if !__has_feature(objc_arc) | |
#error ARC must be enabled! | |
#endif | |
@implementation NSData (OADataHelpers)

// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER, spelled out as bytes so the
// result does not depend on the encoding this source file is saved in.
static const unsigned char OAReplacementCharacterUTF8[] = { 0xEF, 0xBF, 0xBD };

// Returns the total sequence length (1-6) announced by a UTF-8 lead byte, or 0 when
// the byte cannot start a sequence (a continuation byte 10xxxxxx, or 0xFE / 0xFF).
//
//  bits  last code point  byte layout
//   7    U+007F           0xxxxxxx
//  11    U+07FF           110xxxxx 10xxxxxx
//  16    U+FFFF           1110xxxx 10xxxxxx 10xxxxxx
//  21    U+1FFFFF         11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//  26    U+3FFFFFF        111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
//  31    U+7FFFFFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
static NSUInteger OAUTF8SequenceLengthForLeadByte(unsigned char leadByte)
{
    if ((leadByte & 0x80) == 0x00) return 1;
    if ((leadByte & 0xE0) == 0xC0) return 2;
    if ((leadByte & 0xF0) == 0xE0) return 3;
    if ((leadByte & 0xF8) == 0xF0) return 4;
    if ((leadByte & 0xFC) == 0xF8) return 5;
    if ((leadByte & 0xFE) == 0xFC) return 6;
    return 0;
}

// Returns YES when CoreFoundation is able to build a string from the given bytes
// interpreted as UTF-8. A structurally well-formed sequence can still be rejected
// here, so this is the final word on whether a sequence is copied to the output.
static BOOL OAIsDecodableUTF8Sequence(const unsigned char *sequence, NSUInteger sequenceLength)
{
    CFStringRef decoded = CFStringCreateWithBytes(kCFAllocatorDefault,
                                                  sequence,
                                                  (CFIndex)sequenceLength,
                                                  kCFStringEncodingUTF8,
                                                  false);
    if (!decoded)
    {
        return NO;
    }
    CFRelease(decoded);
    return YES;
}

// Decodes the receiver as UTF-8.
// Strict decoding is attempted first; only when it fails are the bytes passed through
// -dataByHealingUTF8Stream and decoded again. Returns whatever that second decoding
// attempt produces.
- (NSString*) UTF8String
{
    // First we try strict decoding to avoid the healing overhead when not needed (majority of cases).
    NSString* str = [[NSString alloc] initWithData:self encoding:NSUTF8StringEncoding];
    if (!str)
    {
        // Here data contains invalid bytes, so we clean them up and decode again.
        return [[NSString alloc] initWithData:[self dataByHealingUTF8Stream] encoding:NSUTF8StringEncoding];
    }
    return str;
}

// Returns data in which every broken UTF-8 sequence is replaced by U+FFFD (EF BF BD).
//
// Rules:
//  - A complete sequence accepted by CoreFoundation is copied unchanged.
//  - A broken sequence is the offending lead byte plus the continuation bytes that
//    directly follow it (at most as many as the lead byte announces). It is replaced
//    by ONE replacement character, and scanning resumes at the first byte that does
//    not belong to it. Valid bytes after a damaged lead byte are therefore preserved
//    instead of being swallowed.
//  - A sequence cut off by the end of the data is treated as broken and replaced,
//    rather than silently dropped.
//  - A byte that cannot start a sequence is replaced on its own.
//
// Empty data is returned as is (the receiver itself).
- (NSData*) dataByHealingUTF8Stream
{
    NSUInteger length = [self length];
    if (length == 0) return self;

    const unsigned char *bytes = [self bytes];
    NSMutableData *healedData = [NSMutableData dataWithCapacity:length];

    // [validRunStart, index) is the current run of good bytes not yet copied to healedData.
    // Copying whole runs straight from the source avoids an intermediate staging buffer.
    NSUInteger validRunStart = 0;
    NSUInteger index = 0;

    while (index < length)
    {
        NSUInteger expectedLength = OAUTF8SequenceLengthForLeadByte(bytes[index]);

        // ASCII is always valid UTF-8.
        if (expectedLength == 1)
        {
            index++;
            continue;
        }

        // Count how many bytes really belong to this sequence: the lead byte plus the
        // continuation bytes that follow it, never reading past the end of the data.
        NSUInteger availableLength = 1;
        while (availableLength < expectedLength &&
               index + availableLength < length &&
               (bytes[index + availableLength] & 0xC0) == 0x80)
        {
            availableLength++;
        }

        if (expectedLength != 0 &&
            availableLength == expectedLength &&
            OAIsDecodableUTF8Sequence(bytes + index, expectedLength))
        {
            index += expectedLength;
            continue;
        }

        // Broken sequence: flush the good run that precedes it, then substitute.
        if (index > validRunStart)
        {
            [healedData appendBytes:bytes + validRunStart length:index - validRunStart];
        }
        [healedData appendBytes:OAReplacementCharacterUTF8 length:sizeof(OAReplacementCharacterUTF8)];

        index += availableLength;
        validRunStart = index;
    }

    if (length > validRunStart)
    {
        [healedData appendBytes:bytes + validRunStart length:length - validRunStart];
    }
    return healedData;
}

@end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
I noticed a case where a UTF-8 string is broken by a 2-byte UTF-8 character that is only 1 byte long: for example, only the b11100000 byte appears, and a valid ASCII character ( " ) follows it. Using your code with small modifications, I was able to solve this issue and parse the JSON successfully.
If you are interested, take a look at the modifications I made: https://gist.github.com/4709652
-Evgeny