/
utility.py
102 lines (85 loc) · 2.14 KB
/
utility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Load stopwords
def loadFile(file):
    """Read a UTF-8 text file and return its lines.

    Args:
        file: path (or any value accepted by ``open``) of the file to read.

    Returns:
        A list with one string per line, without line terminators.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the content is not valid UTF-8.
    """
    # The context manager guarantees the handle is closed even when reading
    # or decoding fails.
    with open(file, encoding='utf-8') as handle:
        return handle.read().splitlines()
# Match stopwords
def checkInDict(word, dictionary):
    """Tell whether ``word`` equals at least one element of ``dictionary``.

    Args:
        word: value to look for.
        dictionary: iterable whose elements are compared to ``word`` with
            ``==``, one by one.

    Returns:
        True as soon as an equal element is found, otherwise False.
    """
    # Element-wise equality (not the ``in`` operator), so a string passed as
    # ``dictionary`` is compared character by character, never by substring.
    return any(word == entry for entry in dictionary)
# Split a passage into sentences
def getSentences(passage, delimiter=('۔',)):
    """Split ``passage`` into sentences at every delimiter character.

    Args:
        passage: text to split; it is scanned one character at a time.
        delimiter: container of single characters that end a sentence.
            Defaults to the Urdu full stop only.

    Returns:
        A list of sentence strings with the delimiters removed. Nothing is
        trimmed, and a piece is produced on both sides of every delimiter,
        so a passage that ends with a delimiter yields a trailing empty
        string and an empty passage yields ``['']``.
    """
    sentences = []
    # Collect characters in a buffer and join once per sentence instead of
    # growing a string one character at a time.
    current = []
    for char in passage:
        if char in delimiter:
            sentences.append(''.join(current))
            current = []
        else:
            current.append(char)
    # Whatever follows the last delimiter (possibly nothing) is a sentence too.
    sentences.append(''.join(current))
    return sentences
# Get keywords
def removeStopWords(tokens, stopwords):
    """Return the keywords of each sentence: its tokens minus the stopwords.

    Args:
        tokens: iterable of token lists, one list per sentence.
        stopwords: iterable of hashable words to discard.

    Returns:
        A new list holding one keyword list per sentence that contained at
        least one non-stopword token; sentences made up only of stopwords
        (or with no tokens) are skipped. Inside each keyword list, empty
        strings are removed and duplicates are dropped, keeping the order
        of first occurrence. The input is not modified.
    """
    # Build the lookup once: set membership replaces a linear scan of the
    # stopword list for every single token.
    stop_set = frozenset(stopwords)
    keywords = []
    for sentence_tokens in tokens:
        kept = [token for token in sentence_tokens if token not in stop_set]
        # The skip decision is made before empty tokens are removed, so a
        # sentence consisting only of empty tokens still contributes an
        # (empty) keyword list.
        if not kept:
            continue
        # dict.fromkeys de-duplicates while preserving insertion order.
        keywords.append(list(dict.fromkeys(t for t in kept if t != '')))
    return keywords
# Get tokens
def tokenize(sentences, alphabets):
    """Split every sentence into tokens at the separator characters.

    Args:
        sentences: sequence of sentence strings.
        alphabets: accepted but not used by this function.

    Returns:
        A list with one token list per sentence. A new token starts at
        every separator (space, Arabic comma, hyphen, en dash), so
        consecutive, leading or trailing separators produce empty-string
        tokens, and an empty sentence yields ``['']``.
    """
    separators = (' ', '،', '-', '–')
    result = []
    for sentence in sentences:
        words = ['']
        for ch in sentence:
            if ch in separators:
                # Separator: open a fresh (initially empty) token.
                words.append('')
            else:
                words[-1] = words[-1] + ch
        result.append(words)
    return result
# Clear function
def clearExtraChar(keywords):
    """Remove empty-string entries from every keyword list, in place.

    Args:
        keywords: list of keyword lists; each element is replaced by a
            filtered copy of itself.

    Returns:
        The same ``keywords`` object that was passed in.
    """
    unwanted = ('',)
    for position, group in enumerate(keywords):
        keywords[position] = [entry for entry in group if entry not in unwanted]
    return keywords
# Remove duplicates from a list
def removeDuplicates(x):
    """Return the distinct items of ``x`` in order of first appearance.

    Args:
        x: iterable of hashable items.

    Returns:
        A new list in which every item occurs exactly once.
    """
    # Dict keys are unique and keep insertion order.
    first_seen = {item: None for item in x}
    return list(first_seen)
# Check whether a word is a year
def checkIfYear(word):
    """Tell whether ``word`` looks like a year.

    A word qualifies when it is either exactly four numeric characters, or
    four numeric characters with a single hamza sign ('ء') attached at the
    start or at the end.

    Args:
        word: the token to test (a string).

    Returns:
        True if the word matches one of the two accepted forms, else False.
    """
    marker = 'ء'
    if len(word) == 5:
        if word[0] == marker:
            digits = word[1:]
        elif word[-1] == marker:
            digits = word[:-1]
        else:
            # Five characters without the marker: a plain five-digit number
            # is not a year, so do not fall through to the numeric check.
            return False
        return digits.isnumeric()
    return len(word) == 4 and word.isnumeric()