Last active
November 5, 2017 13:15
-
-
Save hikui/9408758 to your computer and use it in GitHub Desktop.
如果在GB18030编码的文字中,有几个不合法的字节。本类可以将这些不合法的字节变成“?”,防止转NSString时出现nil。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// NSData+HG_DataHealing.h | |
// HGDataHealing | |
// | |
// Created by 缪 和光 on 14-3-7. | |
// Copyright (c) 2014年 Hokuang. All rights reserved. | |
// | |
#import <Foundation/Foundation.h> | |
@interface NSData (HG_DataHealing) | |
- (NSString *)GB18030String; | |
@end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// NSData+HG_DataHealing.m | |
// HGDataHealing | |
// | |
// Created by 缪 和光 on 14-3-7. | |
// Copyright (c) 2014年 Hokuang. All rights reserved. | |
// | |
#import "NSData+HG_DataHealing.h" | |
@implementation NSData (HG_DataHealing) | |
- (NSString *)GB18030String { | |
NSStringEncoding enc = CFStringConvertEncodingToNSStringEncoding(kCFStringEncodingGB_18030_2000); | |
NSString *str = [[NSString alloc]initWithData:self encoding:enc]; | |
if (!str) { | |
str = [[NSString alloc]initWithData:[self dataByHealingGB18030Stream] encoding:enc]; | |
} | |
return str; | |
} | |
- (NSData *)dataByHealingGB18030Stream { | |
NSUInteger length = [self length]; | |
if (length == 0) { | |
return self; | |
} | |
static NSString * replacementCharacter = @"?"; | |
NSStringEncoding enc = CFStringConvertEncodingToNSStringEncoding(kCFStringEncodingGB_18030_2000); | |
NSData *replacementCharacterData = [replacementCharacter dataUsingEncoding:enc]; | |
NSMutableData *resultData = [NSMutableData dataWithCapacity:self.length]; | |
const Byte *bytes = [self bytes]; | |
static const NSUInteger bufferMaxSize = 1024; | |
Byte buffer[bufferMaxSize]; | |
NSUInteger bufferIndex = 0; | |
NSUInteger byteIndex = 0; | |
BOOL invalidByte = NO; | |
#define FlushBuffer() if (bufferIndex > 0) { \ | |
[resultData appendBytes:buffer length:bufferIndex]; \ | |
bufferIndex = 0; \ | |
} | |
#define CheckBuffer() if ((bufferIndex+5) >= bufferMaxSize) { \ | |
[resultData appendBytes:buffer length:bufferIndex]; \ | |
bufferIndex = 0; \ | |
} | |
while (byteIndex < length) { | |
Byte byte = bytes[byteIndex]; | |
//检查第一位 | |
if (byte >= 0 && byte <= (Byte)0x7f) { | |
//单字节文字 | |
CheckBuffer(); | |
buffer[bufferIndex++] = byte; | |
} else if (byte >= (Byte)0x81 && byte <= (Byte)0xfe){ | |
//可能是双字节,可能是四字节 | |
if (byteIndex + 1 >= length) { | |
//这是最后一个字节了,但是这个字节表明后面应该还有1或3个字节,那么这个字节一定是错误字节 | |
FlushBuffer(); | |
return resultData; | |
} | |
Byte byte2 = bytes[++byteIndex]; | |
if (byte2 >= (Byte)0x40 && byte <= (Byte)0xfe && byte != (Byte)0x7f) { | |
//是双字节,并且可能合法 | |
Byte tuple[] = {byte, byte2}; | |
CFStringRef cfstr = CFStringCreateWithBytes(kCFAllocatorDefault, tuple, 2, kCFStringEncodingGB_18030_2000, false); | |
if (cfstr) { | |
CFRelease(cfstr); | |
CheckBuffer(); | |
buffer[bufferIndex++] = byte; | |
buffer[bufferIndex++] = byte2; | |
} else { | |
//这个双字节字符不合法,但byte2可能是下一字符的第一字节 | |
byteIndex -= 1; | |
invalidByte = YES; | |
} | |
} else if (byte2 >= (Byte)0x30 && byte2 <= (Byte)0x39) { | |
//可能是四字节 | |
if (byteIndex + 2 >= length) { | |
FlushBuffer(); | |
return resultData; | |
} | |
Byte byte3 = bytes[++byteIndex]; | |
if (byte3 >= (Byte)0x81 && byte3 <= (Byte)0xfe) { | |
// 第三位合法,判断第四位 | |
Byte byte4 = bytes[++byteIndex]; | |
if (byte4 >= (Byte)0x30 && byte4 <= (Byte)0x39) { | |
//第四位可能合法 | |
Byte tuple[] = {byte, byte2, byte3, byte4}; | |
CFStringRef cfstr = CFStringCreateWithBytes(kCFAllocatorDefault, tuple, 4, kCFStringEncodingGB_18030_2000, false); | |
if (cfstr) { | |
CFRelease(cfstr); | |
CheckBuffer(); | |
buffer[bufferIndex++] = byte; | |
buffer[bufferIndex++] = byte2; | |
buffer[bufferIndex++] = byte3; | |
buffer[bufferIndex++] = byte4; | |
} else { | |
//这个四字节字符不合法,但是byte2可能是下一个合法字符的第一字节,回退3位 | |
//并且将byte1,byte2用?替代 | |
byteIndex -= 3; | |
invalidByte = YES; | |
} | |
} else { | |
//第四字节不合法 | |
byteIndex -= 3; | |
invalidByte = YES; | |
} | |
} else { | |
// 第三字节不合法 | |
byteIndex -= 2; | |
invalidByte = YES; | |
} | |
} else { | |
// 第二字节不是合法的第二位,但可能是下一个合法的第一位,所以回退一个byte | |
invalidByte = YES; | |
byteIndex -= 1; | |
} | |
if (invalidByte) { | |
invalidByte = NO; | |
FlushBuffer(); | |
[resultData appendData:replacementCharacterData]; | |
} | |
} | |
byteIndex++; | |
} | |
FlushBuffer(); | |
return resultData; | |
} | |
@end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment