Updated vendor copy of html-scanner lib, for bug fixes and optimizations

git-svn-id: http://svn-commit.rubyonrails.org/rails/trunk@1416 5ecf4fe2-1ee6-0310-87b1-e25e094e27de
This commit is contained in:
Jamis Buck
2005-06-14 10:30:36 +00:00
parent bca13f727e
commit c23b2a4ad3
5 changed files with 28 additions and 19 deletions

View File

@@ -1,5 +1,7 @@
*SVN*
* Updated vendor copy of html-scanner lib to 0.5.1, for bug fixes and optimizations
* Changed test requests to come from 0.0.0.0 instead of 127.0.0.1 such that they don't trigger debugging screens on exceptions, but instead call rescue_action_in_public
* Modernize scaffolding to match the generator: use the new render method and change style from the warty @params["id"] to the sleek params[:id]. #1367

View File

@@ -1,7 +1,7 @@
require 'html/tokenizer'
require 'html/node'
module HTML#:nodoc:
module HTML #:nodoc:
# A top-level HTML document. You give it a body of text, and it will parse that
# text into a tree of nodes.
@@ -11,7 +11,7 @@ module HTML#:nodoc:
attr_reader :root
# Create a new Document from the given text.
def initialize(text)
def initialize(text, strict=false)
tokenizer = Tokenizer.new(text)
@root = Node.new(nil)
node_stack = [ @root ]
@@ -28,7 +28,7 @@ module HTML#:nodoc:
open_start = 0 if open_start < 0
close_start = node.position - 20
close_start = 0 if close_start < 0
warn <<EOF.strip
msg = <<EOF.strip
ignoring attempt to close #{node_stack.last.name} with #{node.name}
opened at byte #{node_stack.last.position}, line #{node_stack.last.line}
closed at byte #{node.position}, line #{node.line}
@@ -36,6 +36,7 @@ ignoring attempt to close #{node_stack.last.name} with #{node.name}
text around open: #{text[open_start,40].inspect}
text around close: #{text[close_start,40].inspect}
EOF
strict ? raise(msg) : warn(msg)
end
elsif node.closing != :close
node_stack.push node

View File

@@ -1,8 +1,8 @@
require 'strscan'
module HTML#:nodoc:
module HTML #:nodoc:
class Conditions < Hash#:nodoc:
class Conditions < Hash #:nodoc:
def initialize(hash)
super()
hash = { :content => hash } unless Hash === hash
@@ -54,7 +54,7 @@ module HTML#:nodoc:
end
# The base class of all nodes, textual and otherwise, in an HTML document.
class Node#:nodoc:
class Node #:nodoc:
# The array of children of this node. Not all nodes have children.
attr_reader :children
@@ -91,6 +91,8 @@ module HTML#:nodoc:
# Search the children of this node for the first node for which #find
# returns non +nil+. Returns the result of the #find call that succeeded.
def find(conditions)
conditions = validate_conditions(conditions)
@children.each do |child|
node = child.find(conditions)
return node if node
@@ -101,6 +103,8 @@ module HTML#:nodoc:
# Search for all nodes that match the given conditions, and return them
# as an array.
def find_all(conditions)
conditions = validate_conditions(conditions)
matches = []
matches << self if match(conditions)
@children.each do |child|
@@ -183,7 +187,7 @@ module HTML#:nodoc:
end
# A node that represents text, rather than markup.
class Text < Node#:nodoc:
class Text < Node #:nodoc:
attr_reader :content
@@ -239,7 +243,7 @@ module HTML#:nodoc:
# A Tag is any node that represents markup. It may be an opening tag, a
# closing tag, or a self-closing tag. It has a name, and may have a hash of
# attributes.
class Tag < Node#:nodoc:
class Tag < Node #:nodoc:
# Either +nil+, <tt>:close</tt>, or <tt>:self</tt>
attr_reader :closing
@@ -268,7 +272,9 @@ module HTML#:nodoc:
# Returns non-+nil+ if this tag cannot contain child nodes.
def childless?
@name =~ /^(img|br|hr|link|meta|area|base|basefont|col|frame|input|isindex|param)$/o
!@closing.nil? ||
@name =~ /^(img|br|hr|link|meta|area|base|basefont|
col|frame|input|isindex|param)$/ox
end
# Returns a textual representation of the node
@@ -284,6 +290,7 @@ module HTML#:nodoc:
s << " /" if @closing == :self
s << ">"
@children.each { |child| s << child.to_s }
s << "</#{@name}>" if @closing != :self && !@children.empty?
s
end
end

View File

@@ -1,6 +1,6 @@
require 'strscan'
module HTML#:nodoc:
module HTML #:nodoc:
# A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
# token is a string. Each string represents either "text", or an HTML element.
@@ -13,7 +13,7 @@ module HTML#:nodoc:
# while token = tokenizer.next
# p token
# end
class Tokenizer#:nodoc:
class Tokenizer #:nodoc:
# The current (byte) position in the text
attr_reader :position
@@ -51,7 +51,7 @@ module HTML#:nodoc:
tag = @scanner.getch
if @scanner.scan(/!--/) # comment
tag << @scanner.matched
tag << @scanner.scan_until(/--\s*>/)
tag << (@scanner.scan_until(/--\s*>/) || @scanner.scan_until(/\Z/))
elsif @scanner.scan(/!/) # doctype
tag << @scanner.matched
tag << consume_quoted_regions
@@ -63,14 +63,13 @@ module HTML#:nodoc:
# Scan all text up to the next < character and return it.
def scan_text
@scanner.getch + (@scanner.scan(/[^<]*/) || "")
"#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
end
# Counts the number of newlines in the text and updates the current line
# accordingly.
def update_current_line(text)
@current_line += text.scan(/\r\n|\r|\n/).length
text
text.scan(/\r?\n/) { @current_line += 1 }
end
# Skips over quoted strings, so that less-than and greater-than characters
@@ -89,7 +88,7 @@ module HTML#:nodoc:
text << match
break if delim == "<" || delim == ">"
# consume the conqued region
# consume the quoted region
while match = @scanner.scan_until(/[\\#{delim}]/)
text << match
break if @scanner.matched == delim

View File

@@ -1,9 +1,9 @@
module HTML#:nodoc:
module Version#:nodoc:
module HTML #:nodoc:
module Version #:nodoc:
MAJOR = 0
MINOR = 5
TINY = 0
TINY = 1
STRING = [ MAJOR, MINOR, TINY ].join(".")