-
Notifications
You must be signed in to change notification settings - Fork 9
/
build_meta
executable file
·249 lines (219 loc) · 8.11 KB
/
build_meta
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#!/usr/bin/env ruby
# build_meta --- generator of parser tables for SPARQL::Grammar::Parser
# Derived from:
# http://www.w3.org/2000/10/swap/grammar/predictiveParser.py
# - predictiveParser.py, Tim Berners-Lee, 2004
#
require 'rubygems'
$:.unshift(File.expand_path(File.join(File.dirname(__FILE__), "..", 'lib')))
require 'rdf/n3'
require 'getoptlong'
# Build rdf/n3/parser/meta.rb from http://www.w3.org/2000/10/swap/grammar/n3-selectors.n3
class BNF < RDF::Vocabulary("http://www.w3.org/2000/10/swap/grammar/bnf#"); end
class REGEX < RDF::Vocabulary("http://www.w3.org/2000/10/swap/grammar/regex#"); end
class N3 < RDF::Vocabulary("http://www.w3.org/2000/10/swap/grammar/n3#"); end
class PredictiveParser
attr_accessor :already, :agenda, :errors, :literalTerminals, :branchTable, :tokenRegexps
attr_accessor :graph
def initialize
@already = []
@agenda = []
@errors = []
@literalTerminals = {}
@branchTable = {}
@tokenRegexps = {}
end
def parse(file)
progress("Loading " + file)
@graph = RDF::Graph.load(file)
progress("Loaded #{@graph.count} statements.")
end
def recordError(str)
errors << str
"##### ERROR: #{str}"
end
def progress(str); puts(str); end
def chatty(str); progress(str) if $verbose; end
def runProduction(lhs)
doProduction(lhs)
while !@agenda.empty?
x = @agenda.shift
@already << x
doProduction(x)
end
if !@errors.empty?
progress("###### FAILED with #{errors.length} errors.")
@errors.each {|s| progress(" #{s}")}
exit(-2)
else
progress("Ok for predictive parsing")
end
end
# Generate branch tables for one production
def doProduction(lhs)
if lhs == BNF.void
progress("\nvoid")
return
end
if lhs == BNF.eof
progress( "\nEOF")
return
end
if lhs.is_a?(RDF::Literal)
literalTerminals[lhs.value()] = 1
return
end
branchDict = {}
rhs = graph.first_object(subject: lhs, predicate: BNF.matches)
if rhs
chatty("\nToken #{lhs} matches regexp #{rhs}")
tokenRegexps[lhs] = rhs.value
cc = graph.query({subject: lhs, predicate: BNF.canStartWith})
progress(recordError("No record of what token #{lhs} can start with")) if cc.empty?
cc.each {|statement| chatty(" Can start with: #{statement.object}")}
return
end
rhs = graph.first_object(subject: lhs, predicate: BNF.mustBeOneSequence)
unless rhs
progress(recordError("No definition of #{lhs}"))
# raise RuntimeError("No definition of %s in\n %s" %(`lhs`, `g`))
return
end
options = rhs
progress("\nProduction #{lhs} :: #{options}")
graph.query({subject: lhs, predicate: BNF.canPrecede}) do |statement|
chatty(" Can precede '#{statement.object}'")
end
graph.query({subject: lhs, predicate: BNF.branch}) do |statement|
branch = statement.object
sequence = graph.first_object(subject: statement.object, predicate: BNF.sequence)
option = RDF::List.new(subject: sequence, graph: graph).to_a
progress(" option: #{option}")
option.each do |part|
agenda << part unless already.include?(part) || agenda.include?(part)
end
conditions = graph.query({subject: branch, predicate: BNF.condition}).map(&:object)
if conditions.empty?
progress(recordError("NO SELECTOR for #{lhs} option #{option}"))
if option.empty?
# Void case - the tricky one
graph.pattern(subject: lhs, predicate: BNF.canPrecede) do |st|
progress(" Can precede #{st.object}")
end
end
end
progress(" Conditions: #{conditions.to_a.map(&:to_s)}")
conditions.each do |str1|
if branchDict.has_key?(str1)
progress(
"Conflict: #{str1} is also the condition for #{branchDict[str1]}")
end
branchDict[str1] = option
end
end
branchDict.keys.each do |str1|
branchDict.keys.each do |str2|
s1, s2 = str1.to_s, str2.to_s
if (s1.index(s2) == 0 || s2.index(s1) == 0) && branchDict[str1] != branchDict[str2]
progress("WARNING: for #{lhs}, #{str1} indicates #{branchDict[str1]}, but #{str2} indicates #{branchDict[str2]}")
end
end
end
branchTable[lhs] = branchDict
end
def litOrNot(value)
(value.is_a?(RDF::Literal) ? "" : ":") + value.to_s.dump
end
def outputBranchTable(io, indent = 0)
ind0 = ' ' * indent
ind1 = ind0 + ' '
ind2 = ind1 + ' '
ind3 = ind2 + ' '
io.puts "#{ind0}BRANCHES = {"
branchTable.keys.sort_by(&:to_s).each do |prod|
# Special case double, integer, and decimal to output just a numericliteral, due to a parser conflict
next if prod.to_s =~ /numericliteral/
io.puts "#{ind1}#{litOrNot(prod)} => {"
branchTable[prod].keys.sort_by(&:to_s).each do |term|
list = branchTable[prod][term].map {|t2| litOrNot(t2)}.join(",\n#{ind3}")
io.puts "#{ind2}#{litOrNot(term)} => [#{list}],"
end
io.puts "#{ind1}},"
end
io.puts "#{ind0}}\n"
end
def outputRegexpTable(io, indent = 0)
ind0 = ' ' * indent
ind1 = ind0 + ' '
io.puts "#{ind0}REGEXPS = {"
tokenRegexps.keys.sort_by(&:to_s).each do |prod|
# Special case double, integer, and decimal to output just a numericliteral, due to a parser conflict
next if prod.to_s =~ /(integer|double|decimal)/
io.puts "#{ind1}#{litOrNot(prod)} => Regexp.compile(" +
case prod.to_s
when /barename/ then %q(%(^[#{BARENAME_START}][#{BARENAME_TAIL}]*))
when /explicituri/ then %q("^<[^>]*>")
when /langcode/ then %q("^[a-zA-Z]+(-[a-zA-Z0-9]+)*")
when /prefix/ then %q(%(^([#{BARENAME_START}][#{BARENAME_TAIL}]*)?:))
when /qname/ then %q(%(^(([#{BARENAME_START}][#{BARENAME_TAIL}]*)?:)?([#{BARENAME_START}][#{BARENAME_TAIL}]*)?))
when /variable/ then %q(%(^\\\\?[#{BARENAME_START}][#{BARENAME_TAIL}]*))
else tokenRegexps[prod].dump
end + "),"
end
io.puts "\n#{ind1}# Hack to replace integer|double|decimal with numericliteral"
io.puts "#{ind1}#{litOrNot(N3.numericliteral)} => Regexp.compile(" + %q(%(^[-+]?[0-9]+(\\\\.[0-9]+)?(e[-+]?[0-9]+)?)) + ")"
io.puts "#{ind0}}\n"
end
end
$verbose = false
$debug = false
grammarFile = File.expand_path(File.join(File.dirname(__FILE__), "../lib/rdf/n3/reader/n3-selectors.n3"))
start = N3.document
output = STDOUT
opts = GetoptLong.new(
["--debug", GetoptLong::NO_ARGUMENT],
["--verbose", GetoptLong::NO_ARGUMENT],
["--grammar", GetoptLong::REQUIRED_ARGUMENT],
["--start", GetoptLong::REQUIRED_ARGUMENT],
["--output", "-o", GetoptLong::REQUIRED_ARGUMENT],
["--help", "-?", GetoptLong::NO_ARGUMENT]
)
opts.each do |opt, arg|
case opt
when '--verbose' then $verbose = true
when '--debug' then $debug = true
when '--grammar' then grammarFile = arg
when '--as' then parseAs = arg
when '--output' then output = File.open(arg, "w")
when '--help'
puts %(Usage: build_meta --grammar=file --as=uri [--output=file]
--grammar=file This is the RDF augmented grammar
--start=uri This is the URI of the production as which the document
is to be parsed
--output=file Where to save output
)
exit(0)
end
end
pp = PredictiveParser.new
pp.parse(grammarFile)
pp.runProduction(start)
unless output == STDOUT
output.puts "# This file is automatically generated by #{__FILE__}"
output.puts "# Branch and Regexp tables derived from #{grammarFile}"
output.puts "module RDF::N3::Meta"
end
pp.outputBranchTable(output, 1)
output.puts %q(
if RUBY_VERSION >= "1.9.0"
BARENAME_START = "A-Z_a-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c-\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd\u{10000}-\u{effff}"
BARENAME_TAIL = "0-9#{BARENAME_START}\u00b7\u0300-\u036f\u203f-\u2040\\\\-"
else
BARENAME_START = "A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\xff"
BARENAME_TAIL = "0-9#{BARENAME_START}\xb7\\\\-"
end
)
pp.outputRegexpTable(output, 1)
unless output == STDOUT
output.puts "end"
end