Skip to content

Instantly share code, notes, and snippets.

@oleganza
Last active December 9, 2022 03:50
Show Gist options
  • Save oleganza/997155 to your computer and use it in GitHub Desktop.
Save oleganza/997155 to your computer and use it in GitHub Desktop.
Creating NSString from NSData by fixing invalid UTF8 characters
// Author: Oleg Andreev <[email protected]>
// May 28, 2011
// Do What The Fuck You Want Public License <http://www.wtfpl.net>
#import "NSData+OADataHelpers.h"
#if !__has_feature(objc_arc)
#error ARC must be enabled!
#endif
@implementation NSData (OADataHelpers)

// Returns the total number of bytes that a UTF-8 sequence starting with `lead`
// claims to occupy, judged from the lead byte's high bits only:
//
//   0xxxxxxx -> 1    110xxxxx -> 2    1110xxxx -> 3
//   11110xxx -> 4    111110xx -> 5    1111110x -> 6
//
// Returns 0 when `lead` cannot start a sequence at all (a stray 10xxxxxx
// continuation byte, 0xFE or 0xFF).
static NSUInteger OAExpectedUTF8SequenceLength(unsigned char lead)
{
    if ((lead & 0x80) == 0x00) return 1;
    if ((lead & 0xE0) == 0xC0) return 2;
    if ((lead & 0xF0) == 0xE0) return 3;
    if ((lead & 0xF8) == 0xF0) return 4;
    if ((lead & 0xFC) == 0xF8) return 5;
    if ((lead & 0xFE) == 0xFC) return 6;
    return 0;
}

// Decodes the receiver as UTF-8 and returns the resulting string.
//
// Strict decoding is attempted first so that well-formed data pays no extra
// cost. Only when strict decoding fails are the bytes passed through
// -dataByHealingUTF8Stream and decoded again.
//
// NOTE: this selector has the same name as -[NSString UTF8String], which
// returns `const char *`. Keep the receiver statically typed as NSData so the
// compiler picks the right signature.
- (NSString*) UTF8String
{
    NSString* strictString = [[NSString alloc] initWithData:self encoding:NSUTF8StringEncoding];
    if (strictString)
    {
        return strictString;
    }

    // The data contains invalid bytes: substitute them and decode the healed copy.
    return [[NSString alloc] initWithData:[self dataByHealingUTF8Stream] encoding:NSUTF8StringEncoding];
}

// Returns a copy of the receiver in which every broken UTF-8 sequence is
// replaced by U+FFFD (REPLACEMENT CHARACTER, encoded as UTF-8). Empty data is
// returned as-is (the receiver itself).
//
// A sequence is treated as broken when:
//   * the byte cannot start a sequence (stray continuation byte, 0xFE, 0xFF);
//   * the lead byte is followed by fewer continuation bytes than it announces,
//     either because a non-continuation byte interrupts it or because the data
//     ends;
//   * the sequence is structurally complete but CFStringCreateWithBytes
//     refuses to build a string from it.
//
// Exactly one U+FFFD is emitted per broken sequence, covering the lead byte
// plus the continuation bytes that actually followed it. Scanning resumes at
// the first byte that was not consumed, so a valid character that directly
// follows a broken lead byte is preserved instead of being swallowed.
- (NSData*) dataByHealingUTF8Stream
{
    NSUInteger length = [self length];
    if (length == 0) return self;

    // Escaped so the result does not depend on the encoding of this source file.
    NSData* replacementData = [@"\uFFFD" dataUsingEncoding:NSUTF8StringEncoding];
    NSMutableData* healedData = [NSMutableData dataWithCapacity:length];
    const unsigned char* bytes = (const unsigned char*)[self bytes];

    // Valid bytes are not copied one by one: [runStart, index) is the pending
    // run of valid input that has been scanned but not yet appended.
    NSUInteger runStart = 0;
    NSUInteger index = 0;

    while (index < length)
    {
        NSUInteger expected = OAExpectedUTF8SequenceLength(bytes[index]);

        // ASCII is always valid UTF-8; just extend the pending run.
        if (expected == 1)
        {
            index++;
            continue;
        }

        // Count the lead byte plus the continuation bytes (10xxxxxx) that really
        // follow it, never reading past the end of the data. For a byte that
        // cannot start a sequence (expected == 0) only that byte is consumed.
        NSUInteger consumed = 1;
        while (consumed < expected &&
               index + consumed < length &&
               (bytes[index + consumed] & 0xC0) == 0x80)
        {
            consumed++;
        }

        BOOL sequenceIsValid = NO;
        if (expected > 1 && consumed == expected)
        {
            // Structurally complete sequences can still be invalid, so check
            // whether a string can actually be created from these bytes.
            CFStringRef probe = CFStringCreateWithBytes(kCFAllocatorDefault,
                                                        bytes + index,
                                                        (CFIndex)expected,
                                                        kCFStringEncodingUTF8,
                                                        false);
            if (probe)
            {
                CFRelease(probe);
                sequenceIsValid = YES;
            }
        }

        if (!sequenceIsValid)
        {
            // Emit the valid run seen so far, then one replacement character for
            // the broken sequence, and start a new run right after it.
            if (index > runStart)
            {
                [healedData appendBytes:bytes + runStart length:index - runStart];
            }
            [healedData appendData:replacementData];
            runStart = index + consumed;
        }

        index += consumed;
    }

    // Append whatever valid bytes remain after the last broken sequence.
    if (length > runStart)
    {
        [healedData appendBytes:bytes + runStart length:length - runStart];
    }
    return healedData;
}

@end
@cherpake
Copy link

cherpake commented Feb 4, 2013

Hi,

I noticed a case where a UTF-8 string is broken by a multi-byte UTF-8 character being truncated to a single byte: for example, only the lead byte b11100000 appears, followed by a valid ASCII character ( " ). Using your code with small modifications, I was able to solve this issue and parse the JSON successfully.

If you are interested, take a look at the modifications I made: https://gist.github.com/4709652

-Evgeny

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment