Skip to content

Instantly share code, notes, and snippets.

@AlanQuatermain
Created March 27, 2012 20:18
Show Gist options
  • Save AlanQuatermain/2219930 to your computer and use it in GitHub Desktop.
Save AlanQuatermain/2219930 to your computer and use it in GitHub Desktop.
Setting up rulesets for line-breaking Japanese text
/**
 * One-time class setup for KHLineBreaker.
 *
 * Populates three file-level tables of blocks:
 *   - kMatchingRules:        per-character-class predicates, (attributed string, index) -> BOOL
 *   - kRulesets:             per-ruleset extra break opportunities, (char before, char after) -> BOOL
 *   - kSpecialBreakingRules: special-case handlers, (attributed string, break point, ruleset) -> BOOL,
 *                            returning NO when a break at that point is prohibited
 *
 * Only runs for KHLineBreaker itself; +initialize invocations on behalf of
 * subclasses return immediately so the tables are not rebuilt.
 */
+ (void) initialize
{
    if ( self != [KHLineBreaker class] )
        return;

    // YES if the character at `index` belongs to `charset`.
    typedef BOOL (^charsetClassMatchingRule)(NSCharacterSet * charset, NSAttributedString * attrStr, NSUInteger index);
    charsetClassMatchingRule inCharset = ^(NSCharacterSet * charset, NSAttributedString * attrStr, NSUInteger index) {
        return ( [charset characterIsMember: [[attrStr string] characterAtIndex: index]] );
    };

    // YES if the character at `index`, or the one following it, carries a non-zero
    // superscript attribute value (the code only inspects NSSuperscriptAttributeName).
    BOOL (^isOrnamentedCharacterComplex)(NSAttributedString*, NSUInteger) = ^BOOL(NSAttributedString * attrStr, NSUInteger index) {
        NSDictionary * attrsOn = [attrStr attributesAtIndex: index effectiveRange: NULL];
        if ( [[attrsOn objectForKey: NSSuperscriptAttributeName] intValue] != 0 )
            return ( YES );

        // The last character has no successor; asking for attributes past the end
        // would raise NSRangeException.
        if ( index + 1 >= [attrStr length] )
            return ( NO );

        if ( [[[attrStr attributesAtIndex: index+1 effectiveRange: NULL] objectForKey: NSSuperscriptAttributeName] intValue] != 0 )
            return ( YES );
        return ( NO );
    };

    // YES if attribute `name` is present at `index`. When `value` is non-nil the
    // attribute must also be equal to it. An out-of-range index never matches.
    typedef BOOL (^attributeMatchingRule)(NSString * name, id value, NSAttributedString * attrStr, NSUInteger index);
    attributeMatchingRule hasAttribute = ^BOOL(NSString * name, id value, NSAttributedString * attrStr, NSUInteger index) {
        // Callers below probe idx+1, which is past the end for the final character.
        if ( index >= [attrStr length] )
            return ( NO );

        id found = [[attrStr attributesAtIndex: index effectiveRange: NULL] objectForKey: name];
        if ( value == nil )
            return ( found != nil );
        return ( [found isEqual: value] );
    };

    ///////////////////////////////////////////////////////////////////////////////
    // setup character class detection
    kMatchingRules[0] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self openingBrackets], aStr, idx); };
    kMatchingRules[1] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self closingBrackets], aStr, idx); };
    kMatchingRules[2] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self hyphens], aStr, idx); };
    kMatchingRules[3] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self dividingPunctuation], aStr, idx); };
    kMatchingRules[4] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self middleDots], aStr, idx); };
    kMatchingRules[5] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self fullStops], aStr, idx); };
    kMatchingRules[6] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self commas], aStr, idx); };
    kMatchingRules[7] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self inseparableCharacters], aStr, idx); };
    kMatchingRules[8] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self iterationMarks], aStr, idx); };
    kMatchingRules[9] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self prolongedSoundMark], aStr, idx); };
    kMatchingRules[10] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self smallKana], aStr, idx); };
    kMatchingRules[11] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self prefixedAbbreviations], aStr, idx); };
    kMatchingRules[12] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self postfixedAbbreviations], aStr, idx); };
    kMatchingRules[13] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self fullWidthIdeographicSpace], aStr, idx); };
    kMatchingRules[14] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self hiragana], aStr, idx); };
    kMatchingRules[15] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self katakana], aStr, idx); };
    kMatchingRules[16] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self mathSymbols], aStr, idx); };
    kMatchingRules[17] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self mathOperators], aStr, idx); };
    kMatchingRules[18] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self ideographicCharacters], aStr, idx); };
    // reference marks
    kMatchingRules[19] = ^(NSAttributedString * aStr, NSUInteger idx){ return hasAttribute(KHReferenceMarkAttributeName, @YES, aStr, idx); };
    // ornamented character complexes
    kMatchingRules[20] = ^(NSAttributedString * aStr, NSUInteger idx){ return isOrnamentedCharacterComplex(aStr, idx); };
    // simple-ruby character complexes
    kMatchingRules[21] = ^(NSAttributedString * aStr, NSUInteger idx){ return hasAttribute(KHRubyCharactersAttributeName, nil, aStr, idx); };
    // jukugo-ruby character complexes (same test as simple ruby: any ruby attribute present)
    kMatchingRules[22] = ^(NSAttributedString * aStr, NSUInteger idx){ return hasAttribute(KHRubyCharactersAttributeName, nil, aStr, idx); };
    kMatchingRules[23] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self groupedNumerals], aStr, idx); };
    kMatchingRules[24] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self unitSymbols], aStr, idx); };
    kMatchingRules[25] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self westernWordSpace], aStr, idx); };
    kMatchingRules[26] = ^(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self westernCharacters], aStr, idx); };
    // warichu brackets: the bracket character itself plus the inline-cutting-note
    // attribute on the FOLLOWING character (hasAttribute returns NO at end of string)
    kMatchingRules[27] = ^BOOL(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self warichuOpeningBrackets], aStr, idx) && hasAttribute(KHInlineCuttingNoteAttributeName, @YES, aStr, idx+1); };
    kMatchingRules[28] = ^BOOL(NSAttributedString * aStr, NSUInteger idx){ return inCharset([self warichuClosingBrackets], aStr, idx) && hasAttribute(KHInlineCuttingNoteAttributeName, @YES, aStr, idx+1); };
    // tate-chu-yoko (horizontal-in-vertical)
    kMatchingRules[29] = ^(NSAttributedString * aStr, NSUInteger idx){ return hasAttribute(KHUseHorizontalInVerticalAttributeName, @YES, aStr, idx); };

    /////////////////////////////////////////////////////////////////////
    // Setup rulesets
    // NOTE(review): the second index of kRulesets appears to be a 1-based class
    // number (hyphens are kMatchingRules[2] but kRulesets[..][3]) -- confirm
    // against the table's declaration.

    // YES if either the character before or after the break is in `set`.
    BOOL (^oneCharInSet)(unichar, unichar, NSCharacterSet*) = ^BOOL(unichar be, unichar af, NSCharacterSet * set) {
        return ( [set characterIsMember: be] || [set characterIsMember: af] );
    };
    // U+30FB KATAKANA MIDDLE DOT on either side
    BOOL (^oneIsMiddleDot)(unichar, unichar) = ^BOOL(unichar be, unichar af) {
        return ( be == 0x30FB || af == 0x30FB );
    };
    // a pair of U+2026 HORIZONTAL ELLIPSIS or a pair of U+2025 TWO DOT LEADER
    BOOL (^twoIdenticalEllipses)(unichar, unichar) = ^BOOL(unichar be, unichar af) {
        return ( (be == 0x2026 && af == 0x2026) || (be == 0x2025 && af == 0x2025) );
    };
    // U+3005 IDEOGRAPHIC ITERATION MARK or U+303B VERTICAL IDEOGRAPHIC ITERATION MARK on either side
    BOOL (^oneIsIdeographicIterationMark)(unichar, unichar) = ^BOOL(unichar be, unichar af) {
        return ( be == 0x3005 || af == 0x3005 || be == 0x303B || af == 0x303B );
    };
    // ASCII percent sign on either side
    BOOL (^oneIsPercentSign)(unichar, unichar) = ^BOOL(unichar be, unichar af) {
        return ( be == (unichar)'%' || af == (unichar)'%' );
    };

    kRulesets[KHLineBreakRulesVeryLoose][3] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self hyphens]); };
    kRulesets[KHLineBreakRulesVeryLoose][4] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self dividingPunctuation]); };
    kRulesets[KHLineBreakRulesVeryLoose][5] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self middleDots]); };
    kRulesets[KHLineBreakRulesVeryLoose][8] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self inseparableCharacters]); };
    kRulesets[KHLineBreakRulesVeryLoose][9] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self iterationMarks]); };
    kRulesets[KHLineBreakRulesVeryLoose][10] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self prolongedSoundMark]); };
    kRulesets[KHLineBreakRulesVeryLoose][11] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self smallKana]); };
    kRulesets[KHLineBreakRulesVeryLoose][12] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self prefixedAbbreviations]); };
    kRulesets[KHLineBreakRulesVeryLoose][13] = ^(unichar be, unichar af) { return oneCharInSet(be, af, [self postfixedAbbreviations]); };

    kRulesets[KHLineBreakRulesLoose][3] = kRulesets[KHLineBreakRulesVeryLoose][3];
    kRulesets[KHLineBreakRulesLoose][5] = oneIsMiddleDot;
    kRulesets[KHLineBreakRulesLoose][8] = twoIdenticalEllipses;
    kRulesets[KHLineBreakRulesLoose][9] = oneIsIdeographicIterationMark;
    kRulesets[KHLineBreakRulesLoose][10] = kRulesets[KHLineBreakRulesVeryLoose][10];
    kRulesets[KHLineBreakRulesLoose][11] = kRulesets[KHLineBreakRulesVeryLoose][11];
    kRulesets[KHLineBreakRulesLoose][13] = oneIsPercentSign;
    kRulesets[KHLineBreakRulesLoose][25] = oneIsMiddleDot;
    kRulesets[KHLineBreakRulesLoose][27] = oneIsPercentSign;

    kRulesets[KHLineBreakRulesStrict][9] = oneIsIdeographicIterationMark;
    kRulesets[KHLineBreakRulesStrict][10] = kRulesets[KHLineBreakRulesVeryLoose][10];
    kRulesets[KHLineBreakRulesStrict][11] = kRulesets[KHLineBreakRulesVeryLoose][11];

    // KHLineBreakRulesVeryStrict doesn't have any additional break possibilities

    /////////////////////////////////////////////////////////////////////
    // Special rule handler blocks. Each returns NO to prohibit a break at the
    // prospective break point and YES when this rule has no objection.
    // All of them read the character (or attributes) at prospectiveBreakPoint-1,
    // so they presumably are never invoked with a break point of 0 -- verify
    // against the caller.

    // no line break opportunity between certain pairs of characters within the inseparable character class
    kSpecialBreakingRules[5] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakPoint, KHLineBreakRuleset ruleset) {
        unichar be = [[attrStr string] characterAtIndex: prospectiveBreakPoint-1];
        unichar af = [[attrStr string] characterAtIndex: prospectiveBreakPoint];

        NSCharacterSet * set = [self inseparableCharacters];
        if ( [set characterIsMember: be] && [set characterIsMember: af] )
        {
            if ( be == 0x2025 && af == 0x2025 )         // two two-dot leaders
                return ( NO );
            else if ( be == 0x2026 && af == 0x2026 )    // two ellipses
                return ( NO );
            else if ( be == 0x2014 && af == 0x2014 )    // two em-dashes
                return ( NO );
            else if ( be == 0x3033 && af == 0x3035 )    // vertical kana repeat mark upper & lower halves
                return ( NO );
            else if ( be == 0x3034 && af == 0x3035 )    // vertical kana repeat mark voiced upper (U+3034) & lower halves
                return ( NO );
        }

        return ( YES );
    };

    // no line break opportunity between characters within the same ornamented character complex
    kSpecialBreakingRules[6] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakPoint, KHLineBreakRuleset ruleset) {
        // we know it's a match for the class already -- just get the appropriate attribute range
        NSRange r = {NSNotFound, 0};
        NSDictionary * attrs = [attrStr attributesAtIndex: prospectiveBreakPoint-1 effectiveRange: &r];
        if ( [[attrs objectForKey: NSSuperscriptAttributeName] intValue] == 0 )
        {
            // prior character is start of ornamented complex (the base character), so no break
            return ( NO );
        }
        else if ( NSMaxRange(r) > prospectiveBreakPoint )
        {
            // the same ornament text extends beyond prospective break point, so no break
            return ( NO );
        }

        // otherwise, it's OK to break here
        return ( YES );
    };

    // no line break opportunity within a single run of Ruby characters
    kSpecialBreakingRules[7] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakPoint, KHLineBreakRuleset ruleset) {
        // we know preceding char has ruby text attached -- what's the range of that attribute run?
        NSRange r = {NSNotFound, 0};
        (void) [attrStr attributesAtIndex: prospectiveBreakPoint-1 effectiveRange: &r];
        if ( NSMaxRange(r) > prospectiveBreakPoint )
        {
            // current ruby complex extends across this prospective line break
            return ( NO );
        }

        // otherwise, the current ruby complex ends at the preceding character, so we can break here
        return ( YES );
    };

    // A simplistic interpretation of the following:
    /* A line break opportunity exists between two consecutive base characters belonging to different jukugo-ruby character complexes (cl-23). There is also a line break opportunity between two consecutive base characters belonging to the same jukugo-ruby character complex (cl-23) and between two runs of ruby text accompanying the corresponding base characters. However, a base character and the accompanying ruby text shall be indivisible, hence there is no line break opportunity between any two consecutive ruby characters in a run of ruby text accompanying a base character.
     */
    kSpecialBreakingRules[8] = kSpecialBreakingRules[7];

    // No line breaks between grouped numerals and postfixed abbreviations.
    // This is overridden by some rulesets.
    kSpecialBreakingRules[9] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakPoint, KHLineBreakRuleset ruleset) {
        unichar be = [[attrStr string] characterAtIndex: prospectiveBreakPoint-1];
        unichar af = [[attrStr string] characterAtIndex: prospectiveBreakPoint];

        if ( [[self groupedNumerals] characterIsMember: be] && [[self postfixedAbbreviations] characterIsMember: af] )
        {
            // check with the ruleset
            rulesetAddition rule = kRulesets[ruleset][KHCharacterClassPostfixedAbbreviations];
            if ( rule == nil )
                return ( NO );      // default is not to allow a break here

            // otherwise, we let the rule decide
            return ( rule(be, af) );
        }

        // otherwise, break is OK by this rule
        return ( YES );
    };

    // Whether to allow line breaks between grouped numerals and trailing western characters. Could go either way.
    kSpecialBreakingRules[10] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakpoint, KHLineBreakRuleset ruleset) {
        unichar be = [[attrStr string] characterAtIndex: prospectiveBreakpoint-1];
        unichar af = [[attrStr string] characterAtIndex: prospectiveBreakpoint];

        if ( [[self groupedNumerals] characterIsMember: be] && [[self westernCharacters] characterIsMember: af] )
            return ( allowBreaksBetweenNumeralsAndTrailingWesternCharacters );  // global setting to choose desired behaviour

        // otherwise, we don't prohibit it by this rule
        return ( YES );
    };

    // not currently implemented (would be kSpecialBreakingRules[11]):
    /* A line break opportunity generally exists between preceding Western characters (cl-27) and trailing postfixed abbreviations (cl-13), unless the preceding Western character (cl-27) is used as a symbol of a quantity or a European numeral, in which case a line break is not allowed between them.
     */

    // Western characters can only be broken at valid hyphenation points, and only by inserting a hyphen
    kSpecialBreakingRules[12] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakpoint, KHLineBreakRuleset ruleset) {
        NSString * str = [attrStr string];
        unichar be = [str characterAtIndex: prospectiveBreakpoint-1];
        unichar af = [str characterAtIndex: prospectiveBreakpoint];

        if ( [[self westernCharacters] characterIsMember: be] && [[self westernCharacters] characterIsMember: af] )
        {
            // find the range of the complete word: from just after the nearest
            // preceding whitespace up to the next following whitespace
            NSRange upTo = NSMakeRange(0, prospectiveBreakpoint);
            NSRange from = NSMakeRange(prospectiveBreakpoint, [str length]-prospectiveBreakpoint);
            NSCharacterSet * set = [NSCharacterSet whitespaceAndNewlineCharacterSet];

            NSUInteger spaceBefore = [str rangeOfCharacterFromSet: set options: NSBackwardsSearch range: upTo].location;
            NSUInteger spaceAfter = [str rangeOfCharacterFromSet: set options: 0 range: from].location;

            // No whitespace on a side means the word runs to that end of the string;
            // doing arithmetic on NSNotFound would produce a nonsensical range.
            NSUInteger wordStart = ( spaceBefore == NSNotFound ? 0 : spaceBefore + 1 );
            NSUInteger wordEnd = ( spaceAfter == NSNotFound ? [str length] : spaceAfter );
            NSRange wordRange = NSMakeRange(wordStart, wordEnd - wordStart);

            // try to determine the language of this word
            static NSLinguisticTagger * __tagger = nil;
            static dispatch_once_t onceToken;
            dispatch_once(&onceToken, ^{
                __tagger = [[NSLinguisticTagger alloc] initWithTagSchemes: @[ NSLinguisticTagSchemeLanguage ] options: 0];
            });

            CFLocaleRef locale = NULL;
            // the tagger is shared and stateful (setString:), so serialise access to it
            @synchronized(__tagger)
            {
                [__tagger setString: [str substringWithRange: wordRange]];
                NSString * language = [__tagger tagAtIndex: 0 scheme: NSLinguisticTagSchemeLanguage tokenRange: NULL sentenceRange: NULL];
                if ( language != nil )
                {
                    CFStringRef localeID = CFLocaleCreateCanonicalLanguageIdentifierFromString(kCFAllocatorDefault, (__bridge CFStringRef)language);
                    if ( localeID != NULL )     // CFRelease(NULL) would crash
                    {
                        locale = CFLocaleCreate(kCFAllocatorDefault, localeID);
                        CFRelease(localeID);
                    }
                }
            }

            // fall back to English when no language could be determined
            if ( locale == NULL )
                locale = CFLocaleCreate(kCFAllocatorDefault, CFSTR("en"));

            // Ask for the last hyphenation point strictly before breakpoint+1; if the
            // breakpoint itself is a valid hyphenation point, that is what comes back.
            CFIndex idx = CFStringGetHyphenationLocationBeforeIndex((__bridge CFStringRef)str, prospectiveBreakpoint+1, CFRangeMake(wordRange.location, wordRange.length), 0, locale, NULL);
            if ( locale != NULL )
                CFRelease(locale);

            // if the given breakpoint isn't a good place for hyphenation, then we can't break here
            if ( idx == kCFNotFound || (NSUInteger)idx != prospectiveBreakpoint )
                return ( NO );
        }

        // no prohibition on line breaks according to this rule
        return ( YES );
    };

    // No line break opportunity exists between characters within the same range of tate-chu-yoko (horizontal-in-vertical)
    // This was previously stored in slot 12 as well, which discarded the Western
    // hyphenation rule installed just above.
    // NOTE(review): confirm kSpecialBreakingRules is declared with at least 14
    // slots and that the dispatcher consults slot 13 for this class.
    kSpecialBreakingRules[13] = ^BOOL(NSAttributedString * attrStr, NSUInteger prospectiveBreakpoint, KHLineBreakRuleset ruleset) {
        if ( [self characterAtIndex: prospectiveBreakpoint-1 ofString: attrStr isMemberOfClass: KHCharacterClassCharactersInTateChuYoko] )
        {
            NSRange r = {NSNotFound, 0};
            (void) [attrStr attributesAtIndex: prospectiveBreakpoint-1 effectiveRange: &r];
            if ( NSMaxRange(r) > prospectiveBreakpoint )
                return ( NO );
        }

        // no prohibition under this rule
        return ( YES );
    };
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment