Created
March 27, 2012 20:18
-
-
Save AlanQuatermain/2219930 to your computer and use it in GitHub Desktop.
Setting up rulesets for line-breaking Japanese text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 One-time class setup: fills the file-level tables kMatchingRules, kRulesets and
 kSpecialBreakingRules (all declared outside this method) with blocks.

 - kMatchingRules[n] takes an attributed string and a character index and reports
   whether the character at that index matches the n-th character class.
 - kRulesets[ruleset][n] takes the characters before and after a prospective break;
   where it is consulted below (special rule 9) a YES result permits the break.
 - kSpecialBreakingRules[n] takes the attributed string, the prospective break index
   and the active ruleset; it returns NO to prohibit the break and YES when the rule
   has no objection.

 The blocks capture `self` (the class object) to reach the character-set accessors.
 */
+ (void) initialize
{
    // +initialize is also invoked for subclasses that don't override it; only run once
    if ( self != [KHLineBreaker class] )
        return;

    typedef BOOL (^charsetClassMatchingRule)(NSCharacterSet * charset, NSAttributedString * attrStr, NSUInteger index);
    // YES if the character at `index` is a member of `charset`
    charsetClassMatchingRule inCharset = ^(NSCharacterSet * charset, NSAttributedString * attrStr, NSUInteger index) {
        return ( [charset characterIsMember: [[attrStr string] characterAtIndex: index]] );
    };

    BOOL (^isOrnamentedCharacterComplex)(NSAttributedString*, NSUInteger) = ^BOOL(NSAttributedString * attrStr, NSUInteger index) {
        // match if current or following character is set super or subscript
        NSDictionary * attrsOn = [attrStr attributesAtIndex: index effectiveRange: NULL];
        if ( [[attrsOn objectForKey: NSSuperscriptAttributeName] intValue] != 0 )
            return ( YES );

        // the last character has no follower; asking for attributes past the end
        // would raise NSRangeException
        if ( index+1 >= [attrStr length] )
            return ( NO );

        if ( [[[attrStr attributesAtIndex: index+1 effectiveRange: NULL] objectForKey: NSSuperscriptAttributeName] intValue] != 0 )
            return ( YES );
        return ( NO );
    };

    typedef BOOL (^attributeMatchingRule)(NSString * name, id value, NSAttributedString * attrStr, NSUInteger index);
    // With a nil `value`: YES if the attribute `name` is present at `index`.
    // Otherwise: YES if the attribute's value there isEqual: to `value`.
    attributeMatchingRule hasAttribute = ^BOOL(NSString * name, id value, NSAttributedString * attrStr, NSUInteger index) {
        // some callers probe idx+1, which is out of range for the final character;
        // a position past the end carries no attributes
        if ( index >= [attrStr length] )
            return ( NO );

        id found = [[attrStr attributesAtIndex: index effectiveRange: NULL] objectForKey: name];
        if ( value == nil )
            return ( found != nil );
        return ( [found isEqual: value] );
    };

    ///////////////////////////////////////////////////////////////////////////////
    // setup character class detection

    kMatchingRules[0] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self openingBrackets], aStr, idx); };
    kMatchingRules[1] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self closingBrackets], aStr, idx); };
    kMatchingRules[2] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self hyphens], aStr, idx); };
    kMatchingRules[3] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self dividingPunctuation], aStr, idx); };
    kMatchingRules[4] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self middleDots], aStr, idx); };
    kMatchingRules[5] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self fullStops], aStr, idx); };
    kMatchingRules[6] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self commas], aStr, idx); };
    kMatchingRules[7] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self inseparableCharacters], aStr, idx); };
    kMatchingRules[8] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self iterationMarks], aStr, idx); };
    kMatchingRules[9] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self prolongedSoundMark], aStr, idx); };
    kMatchingRules[10] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self smallKana], aStr, idx); };
    kMatchingRules[11] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self prefixedAbbreviations], aStr, idx); };
    kMatchingRules[12] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self postfixedAbbreviations], aStr, idx); };
    kMatchingRules[13] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self fullWidthIdeographicSpace], aStr, idx); };
    kMatchingRules[14] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self hiragana], aStr, idx); };
    kMatchingRules[15] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self katakana], aStr, idx); };
    kMatchingRules[16] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self mathSymbols], aStr, idx); };
    kMatchingRules[17] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self mathOperators], aStr, idx); };
    kMatchingRules[18] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self ideographicCharacters], aStr, idx); };
    // the following classes are identified by attributes rather than by character set
    kMatchingRules[19] = ^(NSAttributedString * aStr, NSUInteger idx){ return hasAttribute(KHReferenceMarkAttributeName, @YES, aStr, idx); };    // reference marks
    kMatchingRules[20] = ^(NSAttributedString * aStr, NSUInteger idx){ return isOrnamentedCharacterComplex(aStr, idx); };                        // ornamented character complexes
    // simple-ruby and jukugo-ruby are matched identically: any ruby attribute present
    kMatchingRules[21] = ^(NSAttributedString * aStr, NSUInteger idx){ return hasAttribute(KHRubyCharactersAttributeName, nil, aStr, idx); };    // simple-ruby character complexes
    kMatchingRules[22] = ^(NSAttributedString * aStr, NSUInteger idx){ return hasAttribute(KHRubyCharactersAttributeName, nil, aStr, idx); };    // jukugo-ruby character complexes
    kMatchingRules[23] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self groupedNumerals], aStr, idx); };
    kMatchingRules[24] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self unitSymbols], aStr, idx); };
    kMatchingRules[25] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self westernWordSpace], aStr, idx); };
    kMatchingRules[26] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self westernCharacters], aStr, idx); };
    // warichu brackets: the bracket character itself plus the inline-cutting-note attribute
    // on the FOLLOWING character (hasAttribute returns NO if there is no following character)
    kMatchingRules[27] = ^BOOL(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self warichuOpeningBrackets], aStr, idx) && hasAttribute(KHInlineCuttingNoteAttributeName, @YES, aStr, idx+1); };
    kMatchingRules[28] = ^BOOL(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self warichuClosingBrackets], aStr, idx) && hasAttribute(KHInlineCuttingNoteAttributeName, @YES, aStr, idx+1); };
    kMatchingRules[29] = ^(NSAttributedString * aStr, NSUInteger idx){ return hasAttribute(KHUseHorizontalInVerticalAttributeName, @YES, aStr, idx); };    // tate-chu-yoko

    /////////////////////////////////////////////////////////////////////
    // Setup rulesets
    // NOTE(review): the second index used below appears to be a 1-based character class
    // number (3 = hyphens) whereas kMatchingRules above is 0-based (2 = hyphens) -- confirm
    // against the declarations of kRulesets and the KHCharacterClass constants.

    // YES if either neighbour of the break is a member of `set`
    BOOL (^oneCharInSet)(unichar, unichar, NSCharacterSet*) = ^BOOL(unichar be, unichar af, NSCharacterSet * set) {
        return ( [set characterIsMember: be] || [set characterIsMember: af] );
    };
    // U+30FB
    BOOL (^oneIsMiddleDot)(unichar, unichar) = ^BOOL(unichar be, unichar af) {
        return ( be == 0x30FB || af == 0x30FB );
    };
    // a pair of U+2026, or a pair of U+2025
    BOOL (^twoIdenticalEllipses)(unichar, unichar) = ^BOOL(unichar be, unichar af) {
        return ( (be == 0x2026 && af == 0x2026) || (be == 0x2025 && af == 0x2025) );
    };
    // U+3005 or U+303B on either side
    BOOL (^oneIsIdeographicIterationMark)(unichar, unichar) = ^BOOL(unichar be, unichar af) {
        return ( be == 0x3005 || af == 0x3005 || be == 0x303B || af == 0x303B );
    };
    BOOL (^oneIsPercentSign)(unichar, unichar) = ^BOOL(unichar be, unichar af) {
        return ( be == (unichar)'%' || af == (unichar)'%' );
    };

    kRulesets[KHLineBreakRulesVeryLoose][3] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self hyphens]); };
    kRulesets[KHLineBreakRulesVeryLoose][4] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self dividingPunctuation]); };
    kRulesets[KHLineBreakRulesVeryLoose][5] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self middleDots]); };
    kRulesets[KHLineBreakRulesVeryLoose][8] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self inseparableCharacters]); };
    kRulesets[KHLineBreakRulesVeryLoose][9] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self iterationMarks]); };
    kRulesets[KHLineBreakRulesVeryLoose][10] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self prolongedSoundMark]); };
    kRulesets[KHLineBreakRulesVeryLoose][11] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self smallKana]); };
    kRulesets[KHLineBreakRulesVeryLoose][12] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self prefixedAbbreviations]); };
    kRulesets[KHLineBreakRulesVeryLoose][13] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self postfixedAbbreviations]); };

    kRulesets[KHLineBreakRulesLoose][3] = kRulesets[KHLineBreakRulesVeryLoose][3];
    kRulesets[KHLineBreakRulesLoose][5] = oneIsMiddleDot;
    kRulesets[KHLineBreakRulesLoose][8] = twoIdenticalEllipses;
    kRulesets[KHLineBreakRulesLoose][9] = oneIsIdeographicIterationMark;
    kRulesets[KHLineBreakRulesLoose][10] = kRulesets[KHLineBreakRulesVeryLoose][10];
    kRulesets[KHLineBreakRulesLoose][11] = kRulesets[KHLineBreakRulesVeryLoose][11];
    kRulesets[KHLineBreakRulesLoose][13] = oneIsPercentSign;
    kRulesets[KHLineBreakRulesLoose][25] = oneIsMiddleDot;
    kRulesets[KHLineBreakRulesLoose][27] = oneIsPercentSign;

    kRulesets[KHLineBreakRulesStrict][9] = oneIsIdeographicIterationMark;
    kRulesets[KHLineBreakRulesStrict][10] = kRulesets[KHLineBreakRulesVeryLoose][10];
    kRulesets[KHLineBreakRulesStrict][11] = kRulesets[KHLineBreakRulesVeryLoose][11];

    // KHLineBreakRulesVeryStrict doesn't have any additional break possibilities

    /////////////////////////////////////////////////////////////////////
    // TODO: setup special rule handler blocks
    // NOTE(review): every rule below reads the character/attributes at
    // prospectiveBreakPoint-1, so callers are assumed to pass a break point strictly
    // inside the string (0 < breakPoint < length) -- confirm at the call site.

    // no line break opportunity between certain pairs of characters within the inseparable character class
    kSpecialBreakingRules[5] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakPoint, KHLineBreakRuleset ruleset) {
        unichar be = [[attrStr string] characterAtIndex: prospectiveBreakPoint-1];
        unichar af = [[attrStr string] characterAtIndex: prospectiveBreakPoint];
        NSCharacterSet * set = [self inseparableCharacters];
        if ( [set characterIsMember: be] && [set characterIsMember: af] )
        {
            if ( be == 0x2025 && af == 0x2025 )         // two two-dot leaders
                return ( NO );
            else if ( be == 0x2026 && af == 0x2026 )    // two ellipses
                return ( NO );
            else if ( be == 0x2014 && af == 0x2014 )    // two em-dashes
                return ( NO );
            else if ( be == 0x3033 && af == 0x3035 )    // vertical kana repeat mark upper & lower halves
                return ( NO );
            else if ( be == 0x3034 && af == 0x3035 )    // vertical kana repeat mark voiced upper (U+3034) & lower halves
                return ( NO );
        }

        return ( YES );
    };

    // no line break opportunity between characters within the same ornamented character complex
    kSpecialBreakingRules[6] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakPoint, KHLineBreakRuleset ruleset) {
        // we know it's a match for the class already -- just get the appropriate attribute range
        NSRange r = {NSNotFound, 0};
        NSDictionary * attrs = [attrStr attributesAtIndex: prospectiveBreakPoint-1 effectiveRange: &r];
        if ( [[attrs objectForKey: NSSuperscriptAttributeName] intValue] == 0 )
        {
            // prior character is start of ornamented complex (the base character), so no break
            return ( NO );
        }
        else if ( NSMaxRange(r) > prospectiveBreakPoint )
        {
            // the same ornament text extends beyond prospective break point, so no break
            return ( NO );
        }

        // otherwise, it's OK to break here
        return ( YES );
    };

    // no line break opportunity within a single run of Ruby characters
    kSpecialBreakingRules[7] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakPoint, KHLineBreakRuleset ruleset) {
        // we know preceding char has ruby text attached -- what's the range of that attribute run?
        NSRange r = {NSNotFound, 0};
        (void) [attrStr attributesAtIndex: prospectiveBreakPoint-1 effectiveRange: &r];
        if ( NSMaxRange(r) > prospectiveBreakPoint )
        {
            // current ruby complex extends across this prospective line break
            return ( NO );
        }

        // otherwise, the current ruby complex ends at the preceding character, so we can break here
        return ( YES );
    };

    // A simplistic interpretation of the following:
    /* A line break opportunity exists between two consecutive base characters belonging to different jukugo-ruby character complexes (cl-23). There is also a line break opportunity between two consecutive base characters belonging to the same jukugo-ruby character complex (cl-23) and between two runs of ruby text accompanying the corresponding base characters. However, a base character and the accompanying ruby text shall be indivisible, hence there is no line break opportunity between any two consecutive ruby characters in a run of ruby text accompanying a base character.
     */
    kSpecialBreakingRules[8] = kSpecialBreakingRules[7];

    // No line breaks between grouped numerals and postfixed abbreviations.
    // This is overridden by some rulesets.
    kSpecialBreakingRules[9] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakPoint, KHLineBreakRuleset ruleset) {
        unichar be = [[attrStr string] characterAtIndex: prospectiveBreakPoint-1];
        unichar af = [[attrStr string] characterAtIndex: prospectiveBreakPoint];
        if ( [[self groupedNumerals] characterIsMember: be] && [[self postfixedAbbreviations] characterIsMember: af] )
        {
            // check with the ruleset
            rulesetAddition rule = kRulesets[ruleset][KHCharacterClassPostfixedAbbreviations];
            if ( rule == nil )
                return ( NO );      // default is not to allow a break here

            // otherwise, we let the rule decide
            return ( rule(be, af) );
        }

        // otherwise, break is OK by this rule
        return ( YES );
    };

    // Whether to allow line breaks between grouped numerals and trailing western characters. Could go either way.
    kSpecialBreakingRules[10] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakpoint, KHLineBreakRuleset ruleset) {
        unichar be = [[attrStr string] characterAtIndex: prospectiveBreakpoint-1];
        unichar af = [[attrStr string] characterAtIndex: prospectiveBreakpoint];
        if ( [[self groupedNumerals] characterIsMember: be] && [[self westernCharacters] characterIsMember: af] )
            return ( allowBreaksBetweenNumeralsAndTrailingWesternCharacters );  // global setting to choose desired behaviour

        // otherwise, we don't prohibit it by this rule
        return ( YES );
    };

    // not currently implemented:
    /* A line break opportunity generally exists between preceding Western characters (cl-27) and trailing postfixed abbreviations (cl-13), unless the preceding Western character (cl-27) is used as a symbol of a quantity or a European numeral, in which case a line break is not allowed between them.
     */

    // Western characters can only be broken at valid hyphenation points, and only by inserting a hyphen
    // NOTE(review): this block is stored in slot 12, and the tate-chu-yoko rule further
    // down is ALSO stored in slot 12, replacing it -- so this rule is never consulted as
    // written. One of the two indices is presumably wrong; the table's size and slot
    // meanings are declared elsewhere, so both assignments are left as found. Confirm and fix.
    kSpecialBreakingRules[12] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakpoint, KHLineBreakRuleset ruleset) {
        NSString * text = [attrStr string];
        unichar be = [text characterAtIndex: prospectiveBreakpoint-1];
        unichar af = [text characterAtIndex: prospectiveBreakpoint];
        if ( [[self westernCharacters] characterIsMember: be] && [[self westernCharacters] characterIsMember: af] )
        {
            // find the range of the complete word: from just after the nearest preceding
            // whitespace to just before the nearest following whitespace
            NSRange upTo = NSMakeRange(0, prospectiveBreakpoint);
            NSRange from = NSMakeRange(prospectiveBreakpoint, [text length]-prospectiveBreakpoint);
            NSCharacterSet * set = [NSCharacterSet whitespaceAndNewlineCharacterSet];
            NSRange spaceBefore = [text rangeOfCharacterFromSet: set options: NSBackwardsSearch range: upTo];
            NSRange spaceAfter = [text rangeOfCharacterFromSet: set options: 0 range: from];

            // a missing delimiter (NSNotFound) means the word runs to the start/end of the
            // string; doing arithmetic on NSNotFound would yield a nonsense range
            NSUInteger wordStart = ( spaceBefore.location == NSNotFound ? 0 : spaceBefore.location+1 );
            NSUInteger wordEnd = ( spaceAfter.location == NSNotFound ? [text length] : spaceAfter.location );
            NSRange wordRange = NSMakeRange(wordStart, wordEnd - wordStart);

            // try to determine the language of this word
            static NSLinguisticTagger * __tagger = nil;
            static dispatch_once_t onceToken;
            dispatch_once(&onceToken, ^{
                __tagger = [[NSLinguisticTagger alloc] initWithTagSchemes: @[ NSLinguisticTagSchemeLanguage ] options: 0];
            });

            CFLocaleRef locale = NULL;
            // the tagger is shared between invocations of this block, so serialise access to it
            @synchronized(__tagger)
            {
                [__tagger setString: [text substringWithRange: wordRange]];
                NSString * language = [__tagger tagAtIndex: 0 scheme: NSLinguisticTagSchemeLanguage tokenRange: NULL sentenceRange: NULL];
                if ( language != nil )
                {
                    CFStringRef localeID = CFLocaleCreateCanonicalLanguageIdentifierFromString(kCFAllocatorDefault, (__bridge CFStringRef)language);
                    if ( localeID != NULL )     // CFRelease(NULL) would crash
                    {
                        locale = CFLocaleCreate(kCFAllocatorDefault, localeID);
                        CFRelease(localeID);
                    }
                }
            }

            // no language detected (or no locale could be built for it): fall back to English
            if ( locale == NULL )
                locale = CFLocaleCreate(kCFAllocatorDefault, CFSTR("en"));
            if ( locale == NULL )
                return ( NO );      // can't evaluate hyphenation at all, so don't split the word

            // ask for the last hyphenation point before (breakpoint+1), i.e. at or before the breakpoint
            CFIndex idx = CFStringGetHyphenationLocationBeforeIndex((__bridge CFStringRef)text, prospectiveBreakpoint+1, CFRangeMake(wordRange.location, wordRange.length), 0, locale, NULL);
            CFRelease(locale);

            // if the given breakpoint isn't a good place for hyphenation, then we can't break here
            if ( idx != prospectiveBreakpoint )
                return ( NO );
        }

        // no prohibition on line breaks according to this rule
        return ( YES );
    };

    // No line break opportunity exists between characters within the same range of tate-chu-yoko (horizontal-in-vertical)
    // NOTE(review): reuses slot 12 -- see the note on the Western-character rule above.
    kSpecialBreakingRules[12] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakpoint, KHLineBreakRuleset ruleset) {
        if ( [self characterAtIndex: prospectiveBreakpoint-1 ofString: attrStr isMemberOfClass: KHCharacterClassCharactersInTateChuYoko] )
        {
            NSRange r = {NSNotFound, 0};
            (void) [attrStr attributesAtIndex: prospectiveBreakpoint-1 effectiveRange: &r];
            if ( NSMaxRange(r) > prospectiveBreakpoint )
                return ( NO );
        }

        // no prohibition under this rule
        return ( YES );
    };
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment