mirror of
https://github.com/github/rails.git
synced 2026-01-28 15:58:03 -05:00
Pull in latest multibyte patch. Closes #6346 [Manfred Stienstra]
git-svn-id: http://svn-commit.rubyonrails.org/rails/trunk@5224 5ecf4fe2-1ee6-0310-87b1-e25e094e27de
This commit is contained in:
@@ -1,6 +1,8 @@
|
||||
*SVN*
|
||||
|
||||
* Add ActiveSupport::Multibyte. Provides String#chars which lets you deal with strings as a sequence of chars, not of bytes. Closes #6242 [Julian Tarkhanov, Manfred Stienstra & Jan Behrens]
|
||||
* Pull in latest multibyte patch. Closes #6346 [Manfred Stienstra]
|
||||
|
||||
* Add ActiveSupport::Multibyte. Provides String#chars which lets you deal with strings as a sequence of chars, not of bytes. Closes #6242 [Julian Tarkhanov, Manfred Stienstra, Thijs van der Vossen & Jan Behrens]
|
||||
|
||||
* Fix issue with #class_inheritable_accessor saving updates to the parent class when initialized with an Array or Hash [mojombo]
|
||||
|
||||
|
||||
@@ -18,7 +18,8 @@ module ActiveSupport::Multibyte::Handlers #:nodoc:
|
||||
# Locations of the raw data files, keyed by data set. The first three are
# resolved against BASE_URI (defined outside this excerpt); :cp1252 is an
# absolute URL.
SOURCES = {
  :codepoints => BASE_URI + 'UnicodeData.txt',
  :composition_exclusion => BASE_URI + 'CompositionExclusions.txt',
  :grapheme_break_property => BASE_URI + 'auxiliary/GraphemeBreakProperty.txt',
  :cp1252 => 'http://unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT'
}
|
||||
|
||||
def initialize
|
||||
@@ -33,6 +34,7 @@ module ActiveSupport::Multibyte::Handlers #:nodoc:
|
||||
@ucd.composition_exclusion = []
|
||||
@ucd.composition_map = {}
|
||||
@ucd.boundary = {}
|
||||
@ucd.cp1252 = {}
|
||||
end
|
||||
|
||||
def parse_codepoints(line)
|
||||
@@ -87,6 +89,12 @@ module ActiveSupport::Multibyte::Handlers #:nodoc:
|
||||
end
|
||||
end
|
||||
|
||||
# Parses one line of the CP1252 mapping file. When the line starts with two
# hexadecimal fields separated by a single whitespace character, the first
# value is stored as a key in @ucd.cp1252 with the second value as its entry.
# Any other line is ignored and nil is returned.
def parse_cp1252(line)
  return unless line =~ /^([0-9A-Fx]+)\s([0-9A-Fx]+)/i

  byte = Regexp.last_match(1).hex
  codepoint = Regexp.last_match(2).hex
  @ucd.cp1252[byte] = codepoint
end
|
||||
|
||||
def create_composition_map
|
||||
@ucd.codepoints.each do |_, cp|
|
||||
if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code)
|
||||
@@ -125,7 +133,7 @@ module ActiveSupport::Multibyte::Handlers #:nodoc:
|
||||
|
||||
# Serializes the parsed tables to +filename+ (opened in binary mode) as one
# Marshal dump of the array
#   [codepoints, composition_exclusion, composition_map, boundary, cp1252]
#
# Exactly one object is written. A reader that calls Marshal.load once only
# sees the first object in the file, so the cp1252 table has to be part of
# that single array rather than a second dump appended after it.
def dump_to(filename)
  tables = [
    @ucd.codepoints,
    @ucd.composition_exclusion,
    @ucd.composition_map,
    @ucd.boundary,
    @ucd.cp1252
  ]
  File.open(filename, 'wb') do |f|
    f.write Marshal.dump(tables)
  end
end
|
||||
end
|
||||
|
||||
@@ -7,12 +7,12 @@ module ActiveSupport::Multibyte::Handlers
|
||||
end
|
||||
|
||||
class UnicodeDatabase #:nodoc:
|
||||
attr_accessor :codepoints, :composition_exclusion, :composition_map, :boundary
|
||||
attr_accessor :codepoints, :composition_exclusion, :composition_map, :boundary, :cp1252
|
||||
|
||||
# Creates a new UnicodeDatabase instance and loads the database.
|
||||
def initialize
|
||||
begin
|
||||
@codepoints, @composition_exclusion, @composition_map, @boundary = self.class.load
|
||||
@codepoints, @composition_exclusion, @composition_map, @boundary, @cp1252 = self.class.load
|
||||
rescue Exception => e
|
||||
raise IOError.new("Couldn't load the unicode tables for UTF8Handler (#{e.message}), handler is unusable")
|
||||
end
|
||||
@@ -20,6 +20,7 @@ module ActiveSupport::Multibyte::Handlers
|
||||
@composition_exclusion ||= []
|
||||
@composition_map ||= {}
|
||||
@boundary ||= {}
|
||||
@cp1252 ||= {}
|
||||
|
||||
# Redefine the === method so we can write shorter rules for grapheme cluster breaks
|
||||
@boundary.each do |k,_|
|
||||
@@ -41,26 +42,11 @@ module ActiveSupport::Multibyte::Handlers
|
||||
|
||||
# Returns the absolute path of the data file, "unicode_tables.dat", located in
# the directory returned by +dirname+.
def self.filename
  File.expand_path File.join(dirname, "unicode_tables.dat")
end
|
||||
|
||||
# Loads the unicode database and returns all the internal objects of UnicodeDatabase.
#
# The file named by +filename+ is tried first. If that fails, every *.dat file
# in +dirname+ is tried in sorted order and the first one that loads is
# returned. Raises IOError when nothing could be loaded.
#
# Only StandardError is rescued: that covers I/O failures and Marshal format
# errors while letting Interrupt, SystemExit and NoMemoryError propagate.
def self.load
  begin
    return load_file(filename)
  rescue StandardError
    # Our own data file could not be loaded; fall back to the others below.
  end

  Dir["#{dirname}/*.dat"].sort.each do |dat|
    begin
      return load_file(dat)
    rescue StandardError
      next
    end
  end

  raise IOError, "Can't load a marshal file for your version of Ruby"
end
|
||||
|
||||
# Reads the file at +filename+ in binary mode and returns the object produced
# by Marshal.load on its contents. Errors from opening the file or from
# Marshal.load propagate to the caller.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects; only point
# this at data files that ship with the library.
def self.load_file(filename)
  File.open(filename, 'rb') { |f| Marshal.load f.read }
end
|
||||
end
|
||||
@@ -275,7 +261,11 @@ module ActiveSupport::Multibyte::Handlers
|
||||
|
||||
# Converts +str+ into a valid UTF-8 string by treating every byte as one
# character:
#
# * bytes below 128 are kept as they are,
# * bytes 128..159 are looked up in UCD.cp1252 (the tests describe these as
#   win_1252 characters); a byte without an entry is used as the codepoint
#   itself,
# * bytes 160..255 become the codepoint with the same number (the tests
#   describe these as iso_8859_1 characters).
#
# The codepoints are encoded with pack('U*') instead of gluing raw lead and
# continuation bytes together; the resulting bytes are the same, but no
# strings of incompatible encodings are concatenated.
def tidy_bytes(str)
  codepoints = str.unpack('C*').map do |n|
    n.between?(128, 159) ? (UCD.cp1252[n] || n) : n
  end
  codepoints.pack('U*')
end
|
||||
|
||||
protected
|
||||
|
||||
Binary file not shown.
Binary file not shown.
BIN
activesupport/lib/active_support/values/unicode_tables.dat
Normal file
BIN
activesupport/lib/active_support/values/unicode_tables.dat
Normal file
Binary file not shown.
@@ -139,14 +139,15 @@ class CharsTest < Test::Unit::TestCase
|
||||
|
||||
# Exercises the chars proxy on the @s[:bytes] fixture: every byte of the
# fixture is expected to be interpreted as a character, so size, reverse and
# reverse! must not raise and must report all five characters.
def test_resilience
  assert_nothing_raised do
    assert_equal 5, @s[:bytes].chars.size, "The sequence contains five interpretable bytes"
  end

  # Expected characters of the fixture, in reverse order.
  reversed = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].reverse.pack('U*')

  assert_nothing_raised do
    assert_equal reversed, @s[:bytes].chars.reverse.to_s, "Reversing the string should only yield interpretable bytes"
  end

  assert_nothing_raised do
    # reverse! is expected to modify the fixture string in place.
    @s[:bytes].chars.reverse!
    assert_equal reversed, @s[:bytes].to_s, "Reversing the string should only yield interpretable bytes"
  end
end
|
||||
|
||||
|
||||
@@ -224,9 +224,17 @@ module UTF8HandlingTest
|
||||
end
|
||||
|
||||
# Checks that tidy_bytes turns arbitrary bytes into valid UTF-8 by
# interpreting them one by one, first on the @bytestring fixture and then on
# individual ISO-8859-1 / Windows-1252 bytes.
def test_tidy_bytes
  result = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*')
  assert_equal result, @handler.tidy_bytes(@bytestring)
  assert_equal "a#{result}a", @handler.tidy_bytes('a' + @bytestring + 'a')
  # The output must be decodable as UTF-8.
  assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') }

  assert_equal "\xC3\xA7", @handler.tidy_bytes("\xE7") # iso_8859_1: small c cedilla
  assert_equal "\xC2\xA9", @handler.tidy_bytes("\xA9") # iso_8859_1: copyright symbol
  assert_equal "\xE2\x80\x9C", @handler.tidy_bytes("\x93") # win_1252: left smart quote
  assert_equal "\xE2\x82\xAC", @handler.tidy_bytes("\x80") # win_1252: euro
  assert_equal "\x00", @handler.tidy_bytes("\x00") # null char
  assert_equal [0xef, 0xbf, 0xbd].pack('U*'), @handler.tidy_bytes("\xef\xbf\xbd") # invalid char
end
|
||||
|
||||
protected
|
||||
|
||||
Reference in New Issue
Block a user