Created
June 30, 2017 05:17
-
-
Save alpha123/e8507c69e5c9867448761d0953c38f02 to your computer and use it in GitHub Desktop.
Ruby IPA parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8 | |
require 'rubygems' | |
gem 'ruby-enum' | |
gem 'unicode' | |
require 'ruby-enum' | |
require 'unicode' | |
module IPA | |
# Table of base consonant symbols.
#
# Consonant#to_s reads a cell as Consonants[manner][place], so each row is one
# manner of articulation (in the order of the MOA values 0..8) and each column
# is one place of articulation (in the order of the POA values 0..10).
# Consonant#to_s treats every entry as the voiced form and derives the
# voiceless form from it through VoicingPairsInv.
# NOTE(review): '' presumably marks a cell for which no symbol is provided --
# confirm; such cells cannot be rendered or parsed meaningfully.
Consonants = [ | |
['m', '', '', 'n', 'n', 'ɳ', 'ȵ', 'ɲ', 'ŋ', 'ɴ', '' ], | |
['b', '', 'd̪', 'd', 'd', 'ɖ', 'ȡ', 'ɟ', 'g', 'ɢ', 'ʔ'], | |
['', '', '', 'z', 'ʒ', 'ʐ', 'ʑ', '', '', '', '' ], | |
['β', 'v', 'ð', 'ð̠', 'ɹ̠˔','ɻ˔','ʝ̟', 'ʝ', 'ɣ', 'ʁ', 'ɦ'], | |
['', '', '', 'ɮ', '', 'ɭ˔','', 'ʎ̝', 'ʟ̝', '', ''], | |
['β̞', 'ʋ', 'ð̞', 'ɹ', 'ɹ̠', 'ɻ', '', 'j', 'ɰ', 'ʁ̞', ''], | |
['', '', 'l̪', 'l', 'l̠', 'ɭ', 'ȴ', 'ʎ', 'ʟ', 'ʟ̠', ''], | |
['ⱱ̟', 'ⱱ', '', 'ɾ', '', 'ɽ', '', '', '', '', ''], | |
['ʙ', '', '', 'r', '', 'ɽ͡r','', '', '', 'ʀ', ''] | |
].map! { |p| p.freeze }.freeze | |
# Row-major flattening of the table: an index i found here corresponds to
# row i / ConsonantsRowLen and column i % ConsonantsRowLen (see articulation_of).
# Some symbols occur in more than one cell; Array#index returns the first.
FlatConsonants = Consonants.flatten.freeze | |
# Number of columns per row, taken from the first row.
ConsonantsRowLen = Consonants[0].size | |
# Maps a voiceless consonant symbol (key) to its voiced counterpart (value).
# articulation_of uses it to find a voiceless symbol's cell in the
# voiced-only Consonants table.
# NOTE(review): 'w' is not present in Consonants, so the 'ʍ' entry cannot be
# resolved to a table cell -- confirm whether that is intended.
VoicingPairs = { | |
'p' => 'b', 't' => 'd', 'ʈ' => 'ɖ', 'ȶ' => 'ȡ', 'c' => 'ɟ', 'k' => 'g', 'q' => 'ɢ', 'ʔ' => 'ʔ', | |
'ɸ' => 'β', 'f' => 'v', 'θ' => 'ð', 'ç' => 'ʝ', 'x' => 'ɣ', 'χ' => 'ʁ', 'h' => 'ɦ', | |
's' => 'z', 'ʃ' => 'ʒ', 'ʂ' => 'ʐ', 'ɕ' => 'ʑ', | |
'ɬ' => 'ɮ', | |
'ʍ' => 'w' | |
}.freeze | |
# Reverse lookup (voiced => voiceless), used by Consonant#to_s when devoicing.
VoicingPairsInv = VoicingPairs.invert.freeze | |
# Secondary consonant features and the modifier letter that marks each one.
# The keys double as Consonant accessor names: Consonant#initialize,
# Consonant#to_s and parse_single all call the accessors dynamically via `send`.
# Consonant#to_s appends the characters in this hash's insertion order.
ConsonantModifiers = { | |
:aspirated => 'ʰ', :ejective => 'ʼ', :palatalized => 'ʲ', :velarized => 'ˠ', :labialized => 'ʷ' | |
}.freeze | |
# Table of base vowel symbols.
#
# Vowel#to_s reads a cell as Vowels[height][backness], so each row is one
# height (in the order of the Height values 0..6) and each column is one
# backness (in the order of the Backness values 0..4).
# The rounded counterpart of an entry is looked up through RoundingPairs.
# NOTE(review): '' presumably marks a cell for which no symbol is provided --
# confirm.
Vowels = [ | |
['i', '', 'ɨ', '', 'ɯ'], | |
['' , 'ɪ', 'ɪ̈', 'ɯ̽', '' ], | |
['e', '', 'ɘ', '', 'ɤ'], | |
['e̞', '', 'ə', '', 'ɤ̞'], | |
['' , 'ɛ', 'ɜ', '', 'ʌ'], | |
['' , 'æ', 'ɐ', '', '',], | |
['' , 'a', 'ä', '', 'ɑ'] | |
].map! { |h| h.freeze }.freeze | |
# Row-major flattening of the table: an index i found here corresponds to
# row i / VowelsRowLen and column i % VowelsRowLen (see articulation_of).
FlatVowels = Vowels.flatten.freeze | |
# Number of columns per row, taken from the first row.
VowelsRowLen = Vowels[0].size | |
# Maps a vowel symbol from the Vowels table (key) to its rounded counterpart
# (value).
# NOTE(review): 'æ' maps to the empty string, presumably because no dedicated
# rounded symbol exists -- confirm. As a consequence RoundingPairsInv maps ''
# back to 'æ'.
RoundingPairs = { | |
'i' => 'y', 'ɨ' => 'ʉ', 'ɯ' => 'u', | |
'ɪ' => 'ʏ', 'ɪ̈' => 'ʊ̈', 'ɯ̽' => 'ʊ', | |
'e' => 'ø', 'ɘ' => 'ɵ', 'ɤ' => 'o', | |
'e̞' => 'ø̞', 'ə' => 'ɵ̞', 'ɤ̞' => 'o̞', | |
'ɛ' => 'œ', 'ɜ' => 'ɞ', 'ʌ' => 'ɔ', | |
'æ' => '' , 'ɐ' => 'ɞ̞', | |
'a' => 'ɶ', 'ä' => 'ɒ̈', 'ɑ' => 'ɒ' | |
}.freeze | |
# Reverse lookup (rounded => table symbol).
RoundingPairsInv = RoundingPairs.invert.freeze | |
# Manner of articulation.
#
# Each value is the row index that Consonant#to_s uses when it reads
# Consonants[manner][place]; the list below is therefore order-sensitive.
class MOA
  include Ruby::Enum

  %i[
    Nasal
    Stop
    SibilantFricative
    NonSibilantFricative
    LateralFricative
    Approximant
    LateralApproximant
    Flap
    Trill
  ].each_with_index { |manner, row| define manner, row }
end
# Place of articulation.
#
# Each value is the column index that Consonant#to_s uses when it reads
# Consonants[manner][place]; the list below is therefore order-sensitive.
class POA
  include Ruby::Enum

  %i[
    Bilabial
    Labiodental
    Dental
    Alveolar
    PalatoAlveolar
    Retroflex
    AlveoloPalatal
    Palatal
    Velar
    Uvular
    Glottal
  ].each_with_index { |place, column| define place, column }
end
# A consonant described by its cell in the Consonants table plus secondary
# features.
#
# place  - column index into a Consonants row (a POA value)
# manner - row index into Consonants (an MOA value)
# voiced - the table stores voiced forms; a falsy value makes #to_s devoice
# aspirated, ejective, palatalized, velarized, labialized - the flags listed
#          in ConsonantModifiers
# long   - appends the length mark
# affricate_type - :none, :lateral, :sibilant or :non_sibilant
class Consonant
  attr_accessor :place, :manner, :voiced, :aspirated, :ejective, :palatalized, :velarized, :labialized, :long
  attr_reader :affricate_type

  alias :voiced? :voiced
  alias :aspirated? :aspirated
  alias :ejective? :ejective
  alias :palatalized? :palatalized
  alias :velarized? :velarized
  alias :labialized? :labialized
  alias :long? :long

  # poa - place of articulation (column), moa - manner of articulation (row).
  # Every flag starts out false and the consonant is not an affricate.
  def initialize(poa, moa)
    @place = poa
    @manner = moa
    @voiced = false
    @affricate_type = :none
    ConsonantModifiers.keys.each { |mod| send(:"#{mod}=", false) }
    @long = false
  end

  # Sets the affricate release type. An affricate is rendered as a stop plus a
  # fricative release, so any type other than :none forces the manner to Stop.
  # Clearing the type (:none) leaves the manner untouched.
  def affricate_type=(type)
    @affricate_type = type
    @manner = MOA::Stop unless type == :none
  end

  def lateral_affricate; @affricate_type == :lateral; end

  # The boolean writers go through self.affricate_type= explicitly: a bare
  # `affricate_type = ...` would only create a local variable.
  def lateral_affricate=(yesno)
    self.affricate_type = yesno ? :lateral : :none
  end

  def sibilant_affricate; @affricate_type == :sibilant; end

  def sibilant_affricate=(yesno)
    self.affricate_type = yesno ? :sibilant : :none
  end

  def non_sibilant_affricate; @affricate_type == :non_sibilant; end

  def non_sibilant_affricate=(yesno)
    self.affricate_type = yesno ? :non_sibilant : :none
  end

  alias lateral_affricate? lateral_affricate
  alias sibilant_affricate? sibilant_affricate
  alias non_sibilant_affricate? non_sibilant_affricate

  # Renders the consonant: base symbol (devoiced if needed), then tie bar and
  # release for affricates, then the ConsonantModifiers characters, then the
  # length mark.
  def to_s
    base = Consonants[@manner][@place]
    base = devoice(base) unless voiced?
    unless @affricate_type == :none
      release = Consonant.new(@place, release_manner)
      release.voiced = @voiced
      base += "\u0361" + release.to_s # U+0361: tie bar joining stop and release
    end
    marks = ConsonantModifiers.map { |prop, diacritic| send(prop) ? diacritic : '' }.join
    base + marks + (long? ? "\u02D0" : '') # U+02D0: length mark
  end

  # Two consonants are equal when place, manner, voicing, affricate type,
  # length and every ConsonantModifiers flag agree. Any other object is unequal.
  def ==(other)
    other.is_a?(Consonant) &&
      @place == other.place &&
      @manner == other.manner &&
      @voiced == other.voiced &&
      @affricate_type == other.affricate_type &&
      @long == other.long &&
      ConsonantModifiers.keys.all? { |prop| send(prop) == other.send(prop) }
  end

  private

  # Returns the voiceless form of a table symbol. Only the first character is
  # the letter; anything after it is a diacritic and must stay behind the
  # replacement letter. Letters without a dedicated voiceless symbol get the
  # voiceless ring (U+0325). Empty cells are returned unchanged.
  def devoice(symbol)
    return symbol if symbol.empty?
    letter = symbol[0]
    (VoicingPairsInv[letter] || letter + "\u0325") + symbol[1..-1]
  end

  # Manner (table row) of the fricative release for the current affricate type.
  def release_manner
    case @affricate_type
    when :lateral then MOA::LateralFricative
    when :sibilant then MOA::SibilantFricative
    when :non_sibilant then MOA::NonSibilantFricative
    else raise ArgumentError, "unknown affricate type: #{@affricate_type.inspect}"
    end
  end
end
# Vowel height.
#
# Each value is the row index that Vowel#to_s uses when it reads
# Vowels[height][backness]; the list below is therefore order-sensitive.
class Height
  include Ruby::Enum

  %i[
    Close
    NearClose
    CloseMid
    Mid
    OpenMid
    NearOpen
    Open
  ].each_with_index { |height, row| define height, row }
end
# Vowel backness.
#
# Each value is the column index that Vowel#to_s uses when it reads
# Vowels[height][backness]; the list below is therefore order-sensitive.
class Backness
  include Ruby::Enum

  %i[
    Front
    NearFront
    Central
    NearBack
    Back
  ].each_with_index { |backness, column| define backness, column }
end
# A vowel described by its cell in the Vowels table plus secondary features.
#
# height      - row index into Vowels (a Height value)
# backness    - column index into a Vowels row (a Backness value)
# rounded     - when truthy, #to_s emits the RoundingPairs counterpart
# tongue_root - :normal, :advanced or :retracted
# nasalized, rhotic, long - boolean flags rendered as trailing marks
class Vowel
  attr_accessor :height, :backness, :rounded, :tongue_root, :nasalized, :rhotic, :long

  alias :rounded? :rounded
  alias :nasalized? :nasalized
  alias :rhotic? :rhotic
  alias :long? :long

  # Tongue root starts out :normal; nasalized, rhotic and long start out false.
  def initialize(height_, backness_, rounded_)
    @height = height_
    @backness = backness_
    @rounded = rounded_
    @tongue_root = :normal
    @nasalized = false
    @rhotic = false
    @long = false
  end

  # Renders the vowel: table symbol (or its rounded counterpart), then the
  # nasalization, tongue-root, rhoticity and length marks, in that order.
  def to_s
    symbol = Vowels[@height][@backness]
    # A symbol without a rounded counterpart renders as '' rather than raising.
    symbol = RoundingPairs.fetch(symbol, '') if rounded?
    symbol += "\u0303" if nasalized?                 # combining tilde
    symbol += "\u0318" if @tongue_root == :advanced  # advanced tongue root
    symbol += "\u0319" if @tongue_root == :retracted # retracted tongue root
    symbol += "\u02DE" if rhotic?                    # rhotic hook
    symbol += "\u02D0" if long?                      # length mark
    symbol
  end

  # Two vowels are equal when height, backness, rounding, tongue root,
  # nasalization, rhoticity and length agree. Any other object is unequal.
  def ==(other)
    other.is_a?(Vowel) &&
      @height == other.height &&
      @backness == other.backness &&
      @rounded == other.rounded &&
      @tongue_root == other.tongue_root &&
      @nasalized == other.nasalized &&
      @rhotic == other.rhotic &&
      @long == other.long
  end
end
# Locates a bare symbol (no length mark or modifier letters) in the vowel or
# consonant table.
#
# Returns [is_vowel, row, column, flag] where row/column address the cell in
# Vowels or Consonants and flag is the rounding (vowels) or voicing
# (consonants) of the symbol. Lookup order:
#   1. unrounded vowel, found directly in the table
#   2. rounded vowel, mapped back to its table symbol via RoundingPairsInv
#   3. voiced consonant, found directly in the table
#   4. voiceless consonant, mapped to its voiced table symbol via VoicingPairs
#
# Raises ArgumentError for an empty symbol (which would otherwise match an
# empty table cell) and for a symbol found in none of the tables.
def self.articulation_of(ipa)
  raise ArgumentError, 'empty IPA symbol' if ipa.nil? || ipa.empty?

  if (idx = FlatVowels.index(ipa))
    is_vowel, rowlen, voiced_or_rounded = true, VowelsRowLen, false
  elsif (idx = FlatVowels.index(RoundingPairsInv[ipa]))
    is_vowel, rowlen, voiced_or_rounded = true, VowelsRowLen, true
  elsif (idx = FlatConsonants.index(ipa))
    is_vowel, rowlen, voiced_or_rounded = false, ConsonantsRowLen, true
  elsif (idx = FlatConsonants.index(VoicingPairs[ipa]))
    is_vowel, rowlen, voiced_or_rounded = false, ConsonantsRowLen, false
  else
    raise ArgumentError, "unrecognized IPA symbol: #{ipa.inspect}"
  end

  [is_vowel, idx / rowlen, idx % rowlen, voiced_or_rounded]
end
private_class_method :articulation_of
# Builds a Vowel or Consonant from one group produced by IPA.parse.
#
# ipa - an array whose first element is the base grapheme (letter plus any
#       combining marks) and whose remaining elements are what followed it:
#       the release letter of a tie-barred pair, modifier letters, the
#       length mark.
#
# Raises ArgumentError when a tie-barred release is not a fricative
# consonant. Errors from articulation_of for unknown symbols propagate.
def self.parse_single(ipa)
  base, *modifiers = ipa
  tie_bar = "\u0361"

  # A tied pair that is itself a Consonants entry is a single symbol, not an
  # affricate, so its second half belongs to the base.
  if base.end_with?(tie_bar) && modifiers.first && FlatConsonants.include?(base + modifiers.first)
    base += modifiers.shift
  end

  advanced  = base.include?("\u0318") # advanced tongue root
  retracted = base.include?("\u0319") # retracted tongue root
  nasal     = base.include?("\u0303") # combining tilde

  # Strip the marks handled above (and a trailing tie bar) so that only the
  # table symbol is looked up.
  bare = base.gsub(/[\u0318\u0319\u0303\u02DE]/, '').chomp(tie_bar)
  is_vowel, row, column, voiced_or_rounded = articulation_of(bare)

  if is_vowel
    vowel = Vowel.new(row, column, voiced_or_rounded)
    vowel.tongue_root =
      if advanced then :advanced
      elsif retracted then :retracted
      else :normal
      end
    vowel.nasalized = nasal
    vowel.rhotic = modifiers.include?("\u02DE")
    vowel.long = modifiers.include?("\u02D0")
    return vowel
  end

  # Consonant.new takes (place, manner), i.e. (column, row).
  consonant = Consonant.new(column, row)
  consonant.voiced = voiced_or_rounded

  # Unicode.abbr_categories returns an array of category symbols. A lowercase
  # letter (:Ll) directly after the base is the release of a tie-barred pair.
  if modifiers.first && Unicode.abbr_categories(modifiers.first).include?(:Ll)
    release = modifiers.shift
    release_is_vowel, release_manner = articulation_of(release)
    consonant.affricate_type =
      if release_is_vowel
        raise ArgumentError, "affricate release must be a consonant: #{release.inspect}"
      elsif release_manner == MOA::LateralFricative then :lateral
      elsif release_manner == MOA::SibilantFricative then :sibilant
      elsif release_manner == MOA::NonSibilantFricative then :non_sibilant
      else
        raise ArgumentError, "affricate release must be a fricative: #{release.inspect}"
      end
  end

  ConsonantModifiers.each do |prop, char|
    consonant.send("#{prop}=", true) if modifiers.include?(char)
  end
  consonant.long = true if modifiers.include?("\u02D0")
  consonant
end
# Parses a string of IPA symbols into an array of Vowel / Consonant objects.
#
# ipa - a String, a Regexp (its source is used), or anything responding to
#       to_s.
#
# The text is split into graphemes, which are then collected into groups of
# one base grapheme plus whatever belongs to it; each group is handed to
# parse_single. Returns [] for empty input.
#
# Raises ArgumentError when the text starts with a modifier that has no base
# symbol to attach to.
def self.parse(ipa)
  text = ipa.is_a?(Regexp) ? ipa.source : ipa.to_s
  tie_bar = "\u0361"

  groups = []
  Unicode.text_elements(text).each do |grapheme|
    current = groups.last
    # Lm and Sk are the Unicode categories "modifier letter" and "modifier
    # symbol"; such graphemes always attach to the preceding base.
    if !(Unicode.abbr_categories(grapheme) & [:Lm, :Sk]).empty?
      unless current
        raise ArgumentError, "modifier #{grapheme.inspect} has no preceding base symbol"
      end
      current << grapheme
    elsif current && current.last.end_with?(tie_bar)
      # The previous grapheme ended in a tie bar, so this letter is its
      # second half and stays in the same group.
      current << grapheme
    else
      groups << [grapheme]
    end
  end

  groups.map { |group| parse_single(group) }
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment