-
Notifications
You must be signed in to change notification settings - Fork 1
/
convertGrammar.py
114 lines (89 loc) · 2.87 KB
/
convertGrammar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from __future__ import division
import sys
import cPickle as pickle
from gzip import GzipFile
from AIMA import DefaultDict
from topdownParser import Grammar, Rule
def readLambdas(ff):
    """Parse one section of ``<nonterminal> <probability>`` lines into a dict.

    Lines are pulled from the iterable ``ff`` until the first blank
    (whitespace-only) line or until ``ff`` is exhausted.  The blank line
    itself is consumed, so a caller that keeps reading from the same
    stream continues with whatever follows it.

    Returns a plain dict mapping the first field of each line to the
    second field converted to float.

    Raises ValueError when a non-blank line does not consist of exactly
    two whitespace-separated fields, or when the second field is not a
    valid float.
    """
    table = {}
    for rawLine in ff:
        tokens = rawLine.split()
        if not tokens:
            # Blank line: this section is finished.
            break
        label, weight = tokens
        table[label] = float(weight)
    return table
def readProductionTable(ff):
    """Parse one section of ``<prob> <nt> -> <word>`` lines into a nested table.

    Lines are pulled from the iterable ``ff`` until the first blank
    (whitespace-only) line or until ``ff`` is exhausted; the blank line
    is consumed.  A progress message is written to stderr before every
    1000th line (starting with the first).

    Returns a ``DefaultDict({})`` in which ``table[nt][word]`` holds the
    probability as a float.  (The per-key default comes from the
    project's ``AIMA.DefaultDict``; presumably each new ``nt`` gets its
    own empty dict -- confirm against that class.)

    Raises ValueError when a non-blank line does not have exactly four
    whitespace-separated fields or the probability is not a valid float,
    and AssertionError when the third field is not ``->``.
    """
    table = DefaultDict({})
    # The loop stops at the first blank line, so the enumerate index is
    # always the number of data lines handled so far.
    for lineNo, rawLine in enumerate(ff):
        tokens = rawLine.split()
        if not tokens:
            # Blank line: this section is finished.
            break
        if lineNo % 1000 == 0:
            print >>sys.stderr, "read", lineNo, "..."
        prob, nt, arrow, word = tokens
        assert(arrow == "->")
        table[nt][word] = float(prob)
    return table
def _addRule(rules, lhs, rhs, prob):
    """Build a Rule and file it under its left-hand side in ``rules``.

    A rule whose right-hand side is exactly its own left-hand side with
    probability 1.0 (X -> X) is not stored; a warning is written to
    stderr instead.
    """
    rule = Rule()
    rule.setup(lhs, rhs, prob)
    if [rule.lhs,] == rule.rhs and rule.prob == 1.0:
        sys.stderr.write("Warning: X->X %s %s\n" % (rule.lhs, rule.rhs))
    else:
        rules[rule.lhs].append(rule)


# Command-line entry point:
#     convertGrammar.py GRAMMAR LEXICON LOOKAHEAD OUT
# Reads the grammar rules, the lexicon and the lookahead tables, builds a
# Grammar object from them and pickles it to OUT.  LOOKAHEAD is read
# through gzip and OUT is written through gzip when the respective name
# ends in ".gz".
if __name__ == "__main__":
    if len(sys.argv) != 5:
        sys.exit("usage: %s GRAMMAR LEXICON LOOKAHEAD OUT" % sys.argv[0])
    (grammarPath, lexiconPath, lookaheadPath, outPath) = sys.argv[1:]

    sys.stdout.write("Grammar: %s Lexicon: %s Lookahead: %s\n" %
                     (grammarPath, lexiconPath, lookaheadPath))

    rules = DefaultDict([])

    # Grammar file: one rule per line, either
    #     LHS -> RHS1 PROB          (4 fields)
    #     LHS -> RHS1 RHS2 PROB     (5 fields)
    grammarFile = open(grammarPath)
    try:
        for ct, line in enumerate(grammarFile):
            if ct % 1000 == 0:
                sys.stderr.write("%d ...\n" % ct)

            fields = line.strip().split()
            if len(fields) == 5:
                rhs = [fields[2], fields[3]]
            elif len(fields) == 4:
                rhs = [fields[2],]
            else:
                # Without this branch a short or long line would silently
                # reuse rhs/prob left over from the previous line.
                raise ValueError("malformed grammar rule at %s:%d: %r" %
                                 (grammarPath, ct + 1, line))
            (lhs, arrow) = fields[0:2]
            assert(arrow == "->")
            prob = float(fields[-1])

            _addRule(rules, lhs, rhs, prob)
    finally:
        grammarFile.close()

    # Lexicon file: "POS WORD <expr>" per line, where <expr> evaluates to
    # a sequence of probabilities.  Entry number i of that sequence
    # becomes the rule POS_i -> WORD.
    lexiconFile = open(lexiconPath)
    try:
        for ct, line in enumerate(lexiconFile):
            if ct % 1000 == 0:
                sys.stderr.write("%d ...\n" % ct)

            fields = line.strip().split()
            (pos, word) = fields[0:2]
            # NOTE(security): eval() executes whatever expression follows
            # the word on the line, so only run this on trusted lexicon
            # files.  If the payload is always a plain list literal,
            # ast.literal_eval would be a safer replacement -- confirm
            # against the lexicon format before switching.
            lst = eval(" ".join(fields[2:]))
            for num, prob in enumerate(lst):
                preterm = "%s_%d" % (pos, num)
                _addRule(rules, preterm, [word,], float(prob))
    finally:
        lexiconFile.close()

    grammar = Grammar(rules)

    # Lookahead file: four blank-line-separated sections read in order
    # from the same stream (lambdas, then three production tables).
    if lookaheadPath.endswith(".gz"):
        look = GzipFile(lookaheadPath)
    else:
        look = open(lookaheadPath)
    try:
        lambdas = readLambdas(look)
        ntToPos = readProductionTable(look)
        ntToWord = readProductionTable(look)
        posToWord = readProductionTable(look)
    finally:
        look.close()

    grammar.setLookahead(lambdas, ntToPos, ntToWord, posToWord)

    sys.stderr.write("dumping\n")
    if outPath.endswith(".gz"):
        outfile = GzipFile(outPath, 'wb')
    else:
        outfile = open(outPath, 'wb')
    try:
        # protocol=-1 asks for the highest pickle protocol available.
        pickler = pickle.Pickler(outfile, protocol=-1)
        # Pickler "fast" mode, as in the original script (per the pickle
        # docs it turns off the memo, so the dumped object graph must not
        # be self-referential).
        pickler.fast = 1
        pickler.dump(grammar)
    finally:
        # Close explicitly so buffered data (and the gzip trailer, for
        # .gz output) reaches disk before the interpreter exits.
        outfile.close()