|
@@ -0,0 +1,212 @@
|
|
|
+From 0496940d5998ccbc50d16fb734993ab50fc60c2d Mon Sep 17 00:00:00 2001
|
|
|
+From: NAITOH Jun <naitoh@gmail.com>
|
|
|
+Date: Mon, 18 Mar 2024 23:30:47 +0900
|
|
|
+Subject: [PATCH] Optimize the parse_attributes method to use `Source#match`
|
|
|
+ to parse XML. (#119)
|
|
|
+
|
|
|
+## Why?
|
|
|
+
|
|
|
+Improve maintainability by consolidating processing into `Source#match`.
|
|
|
+
|
|
|
+## Benchmark
|
|
|
+```
|
|
|
+RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
|
|
|
+ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
|
|
|
+Calculating -------------------------------------
|
|
|
+ before after before(YJIT) after(YJIT)
|
|
|
+ dom 10.891 10.622 16.356 17.403 i/s - 100.000 times in 9.182130s 9.414177s 6.113806s 5.746133s
|
|
|
+ sax 30.335 29.845 49.749 54.877 i/s - 100.000 times in 3.296483s 3.350595s 2.010071s 1.822259s
|
|
|
+ pull 35.514 34.801 61.123 66.908 i/s - 100.000 times in 2.815793s 2.873484s 1.636041s 1.494591s
|
|
|
+ stream 35.141 34.475 52.110 56.836 i/s - 100.000 times in 2.845646s 2.900638s 1.919017s 1.759456s
|
|
|
+
|
|
|
+Comparison:
|
|
|
+ dom
|
|
|
+ after(YJIT): 17.4 i/s
|
|
|
+ before(YJIT): 16.4 i/s - 1.06x slower
|
|
|
+ before: 10.9 i/s - 1.60x slower
|
|
|
+ after: 10.6 i/s - 1.64x slower
|
|
|
+
|
|
|
+ sax
|
|
|
+ after(YJIT): 54.9 i/s
|
|
|
+ before(YJIT): 49.7 i/s - 1.10x slower
|
|
|
+ before: 30.3 i/s - 1.81x slower
|
|
|
+ after: 29.8 i/s - 1.84x slower
|
|
|
+
|
|
|
+ pull
|
|
|
+ after(YJIT): 66.9 i/s
|
|
|
+ before(YJIT): 61.1 i/s - 1.09x slower
|
|
|
+ before: 35.5 i/s - 1.88x slower
|
|
|
+ after: 34.8 i/s - 1.92x slower
|
|
|
+
|
|
|
+ stream
|
|
|
+ after(YJIT): 56.8 i/s
|
|
|
+ before(YJIT): 52.1 i/s - 1.09x slower
|
|
|
+ before: 35.1 i/s - 1.62x slower
|
|
|
+ after: 34.5 i/s - 1.65x slower
|
|
|
+
|
|
|
+```
|
|
|
+
|
|
|
+- YJIT=ON : 1.06x - 1.10x faster
|
|
|
+- YJIT=OFF : 0.97x - 0.98x faster
|
|
|
+
|
|
|
+CVE: CVE-2024-43398
|
|
|
+
|
|
|
+Upstream-Status: Backport [https://github.com/ruby/rexml/commit/0496940d5998ccbc50d16fb734993ab50fc60c2d]
|
|
|
+
|
|
|
+Signed-off-by: Rob Woolley <rob.woolley@windriver.com>
|
|
|
+---
|
|
|
+ lib/rexml/parsers/baseparser.rb | 116 ++++++++++++--------------------
|
|
|
+ test/parse/test_element.rb | 4 +-
|
|
|
+ test/test_core.rb | 20 +++++-
|
|
|
+ 3 files changed, 64 insertions(+), 76 deletions(-)
|
|
|
+
|
|
|
+Index: ruby-3.1.3/.bundle/gems/rexml-3.2.5/lib/rexml/parsers/baseparser.rb
|
|
|
+===================================================================
|
|
|
+--- ruby-3.1.3.orig/.bundle/gems/rexml-3.2.5/lib/rexml/parsers/baseparser.rb
|
|
|
++++ ruby-3.1.3/.bundle/gems/rexml-3.2.5/lib/rexml/parsers/baseparser.rb
|
|
|
+@@ -114,7 +114,7 @@ module REXML
|
|
|
+
|
|
|
+ module Private
|
|
|
+ INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
|
|
|
+- TAG_PATTERN = /((?>#{QNAME_STR}))/um
|
|
|
++ TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
|
|
|
+ CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
|
|
|
+ ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
|
|
|
+ NAME_PATTERN = /\s*#{NAME}/um
|
|
|
+@@ -136,7 +136,6 @@ module REXML
|
|
|
+ self.stream = source
|
|
|
+ @listeners = []
|
|
|
+ @entity_expansion_count = 0
|
|
|
+- @attributes_scanner = StringScanner.new('')
|
|
|
+ end
|
|
|
+
|
|
|
+ def add_listener( listener )
|
|
|
+@@ -635,86 +634,60 @@ module REXML
|
|
|
+ def parse_attributes(prefixes, curr_ns)
|
|
|
+ attributes = {}
|
|
|
+ closed = false
|
|
|
+- match_data = @source.match(/^(.*?)(\/)?>/um, true)
|
|
|
+- if match_data.nil?
|
|
|
+- message = "Start tag isn't ended"
|
|
|
+- raise REXML::ParseException.new(message, @source)
|
|
|
+- end
|
|
|
++ while true
|
|
|
++ if @source.match(">", true)
|
|
|
++ return attributes, closed
|
|
|
++ elsif @source.match("/>", true)
|
|
|
++ closed = true
|
|
|
++ return attributes, closed
|
|
|
++ elsif match = @source.match(QNAME, true)
|
|
|
++ name = match[1]
|
|
|
++ prefix = match[2]
|
|
|
++ local_part = match[3]
|
|
|
+
|
|
|
+- raw_attributes = match_data[1]
|
|
|
+- closed = !match_data[2].nil?
|
|
|
+- return attributes, closed if raw_attributes.nil?
|
|
|
+- return attributes, closed if raw_attributes.empty?
|
|
|
+-
|
|
|
+- @attributes_scanner.string = raw_attributes
|
|
|
+- scanner = @attributes_scanner
|
|
|
+- until scanner.eos?
|
|
|
+- if scanner.scan(/\s+/)
|
|
|
+- break if scanner.eos?
|
|
|
+- end
|
|
|
+-
|
|
|
+- pos = scanner.pos
|
|
|
+- while true
|
|
|
+- break if scanner.scan(ATTRIBUTE_PATTERN)
|
|
|
+- unless scanner.scan(QNAME)
|
|
|
+- message = "Invalid attribute name: <#{scanner.rest}>"
|
|
|
+- raise REXML::ParseException.new(message, @source)
|
|
|
+- end
|
|
|
+- name = scanner[0]
|
|
|
+- unless scanner.scan(/\s*=\s*/um)
|
|
|
++ unless @source.match(/\s*=\s*/um, true)
|
|
|
+ message = "Missing attribute equal: <#{name}>"
|
|
|
+ raise REXML::ParseException.new(message, @source)
|
|
|
+ end
|
|
|
+- quote = scanner.scan(/['"]/)
|
|
|
+- unless quote
|
|
|
+- message = "Missing attribute value start quote: <#{name}>"
|
|
|
+- raise REXML::ParseException.new(message, @source)
|
|
|
+- end
|
|
|
+- unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
|
|
|
+- match_data = @source.match(/^(.*?)(\/)?>/um, true)
|
|
|
+- if match_data
|
|
|
+- scanner << "/" if closed
|
|
|
+- scanner << ">"
|
|
|
+- scanner << match_data[1]
|
|
|
+- scanner.pos = pos
|
|
|
+- closed = !match_data[2].nil?
|
|
|
+- next
|
|
|
++ unless match = @source.match(/(['"])(.*?)\1\s*/um, true)
|
|
|
++ if match = @source.match(/(['"])/, true)
|
|
|
++ message =
|
|
|
++ "Missing attribute value end quote: <#{name}>: <#{match[1]}>"
|
|
|
++ raise REXML::ParseException.new(message, @source)
|
|
|
++ else
|
|
|
++ message = "Missing attribute value start quote: <#{name}>"
|
|
|
++ raise REXML::ParseException.new(message, @source)
|
|
|
+ end
|
|
|
+- message =
|
|
|
+- "Missing attribute value end quote: <#{name}>: <#{quote}>"
|
|
|
+- raise REXML::ParseException.new(message, @source)
|
|
|
+ end
|
|
|
+- end
|
|
|
+- name = scanner[1]
|
|
|
+- prefix = scanner[2]
|
|
|
+- local_part = scanner[3]
|
|
|
+- # quote = scanner[4]
|
|
|
+- value = scanner[5]
|
|
|
+- if prefix == "xmlns"
|
|
|
+- if local_part == "xml"
|
|
|
+- if value != "http://www.w3.org/XML/1998/namespace"
|
|
|
+- msg = "The 'xml' prefix must not be bound to any other namespace "+
|
|
|
++ value = match[2]
|
|
|
++ if prefix == "xmlns"
|
|
|
++ if local_part == "xml"
|
|
|
++ if value != "http://www.w3.org/XML/1998/namespace"
|
|
|
++ msg = "The 'xml' prefix must not be bound to any other namespace "+
|
|
|
++ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
|
|
++ raise REXML::ParseException.new( msg, @source, self )
|
|
|
++ end
|
|
|
++ elsif local_part == "xmlns"
|
|
|
++ msg = "The 'xmlns' prefix must not be declared "+
|
|
|
+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
|
|
+- raise REXML::ParseException.new( msg, @source, self )
|
|
|
++ raise REXML::ParseException.new( msg, @source, self)
|
|
|
+ end
|
|
|
+- elsif local_part == "xmlns"
|
|
|
+- msg = "The 'xmlns' prefix must not be declared "+
|
|
|
+- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
|
|
|
+- raise REXML::ParseException.new( msg, @source, self)
|
|
|
++ curr_ns << local_part
|
|
|
++ elsif prefix
|
|
|
++ prefixes << prefix unless prefix == "xml"
|
|
|
+ end
|
|
|
+- curr_ns << local_part
|
|
|
+- elsif prefix
|
|
|
+- prefixes << prefix unless prefix == "xml"
|
|
|
+- end
|
|
|
+
|
|
|
+- if attributes.has_key?(name)
|
|
|
+- msg = "Duplicate attribute #{name.inspect}"
|
|
|
+- raise REXML::ParseException.new(msg, @source, self)
|
|
|
+- end
|
|
|
++ if attributes.has_key?(name)
|
|
|
++ msg = "Duplicate attribute #{name.inspect}"
|
|
|
++ raise REXML::ParseException.new(msg, @source, self)
|
|
|
++ end
|
|
|
+
|
|
|
+- attributes[name] = value
|
|
|
++ attributes[name] = value
|
|
|
++ else
|
|
|
++ message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
|
|
|
++ raise REXML::ParseException.new(message, @source)
|
|
|
++ end
|
|
|
+ end
|
|
|
+- return attributes, closed
|
|
|
+ end
|
|
|
+ end
|
|
|
+ end
|