Skip to content

Commit

Permalink
Merge pull request #152 from seasidesparrow/master
Browse files Browse the repository at this point in the history
major speed improvement with jats parsing; minor fixes to jats/jats_contrib
  • Loading branch information
seasidesparrow committed Jul 7, 2021
2 parents a2d000c + 489a8a2 commit e832127
Show file tree
Hide file tree
Showing 10 changed files with 199 additions and 295 deletions.
27 changes: 22 additions & 5 deletions pyingest/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,16 @@ def get_uat(data,data_dict):
HEX_ENTITY = carr[2].lower()
DEC_ENTITY = carr[3].lower()
for c in NAME_ENTITY.strip().split():
try:
# preserve greek letters, convert all other high-bit chars
eValue = int(DEC_ENTITY.lstrip('&#').rstrip(';'))
if (eValue >= 913 and eValue <= 969) or (eValue >= 192 and eValue <= 382):
ENTITY_DICTIONARY[UNI_ENTITY.strip()] = c.strip()
ENTITY_DICTIONARY[HEX_ENTITY.strip()] = c.strip()
ENTITY_DICTIONARY[DEC_ENTITY.strip()] = c.strip()
else:
ENTITY_DICTIONARY[UNI_ENTITY.strip()] = DEC_ENTITY.strip()
ENTITY_DICTIONARY[HEX_ENTITY.strip()] = DEC_ENTITY.strip()
ENTITY_DICTIONARY[c.strip()] = DEC_ENTITY.strip()
except Exception as e:
print("Error splitting NAME_ENTITY: '%s'" % NAME_ENTITY)
ENTITY_DICTIONARY[UNI_ENTITY.strip()] = DEC_ENTITY.strip()
ENTITY_DICTIONARY[HEX_ENTITY.strip()] = DEC_ENTITY.strip()
else:
print("broken HTML entity:", l.rstrip())
NAME_ENTITY = "xxxxx"
Expand All @@ -216,20 +220,33 @@ def get_uat(data,data_dict):
# ADS-specific translations: common typographic entities (named and decimal
# forms) are mapped to plain ASCII characters.  The original note says these
# have also been added to html5.txt.
# NOTE(review): the data file in this repository appears to be html5.dat --
# confirm which name is current.
ENTITY_DICTIONARY.update({
    # tilde-like operators
    '&sim;': "~",
    '&#8764;': "~",
    '&Tilde;': "~",
    # single quotation marks
    '&rsquo;': "'",
    '&#8217;': "'",
    '&lsquo;': "'",
    '&#8216;': "'",
    # non-breaking space
    '&nbsp;': " ",
    # dashes
    '&mdash;': "-",
    '&#8212;': "-",
    '&ndash;': "-",
    '&#8211;': "-",
    # double quotation marks
    '&rdquo;': '"',
    '&#8221;': '"',
    '&ldquo;': '"',
    '&#8220;': '"',
    # arithmetic signs
    '&minus;': "-",
    '&#8722;': "-",
    '&plus;': "+",
    '&#43;': "+",
    # fixed-width spaces
    '&thinsp;': " ",
    '&#8201;': " ",
    '&hairsp;': " ",
    '&#8202;': " ",
    '&ensp;': " ",
    '&#8194;': " ",
    '&emsp;': " ",
    '&#8195;': " ",
})


# ProQuest harvester
Expand Down
141 changes: 0 additions & 141 deletions pyingest/config/html5.dat
Original file line number Diff line number Diff line change
Expand Up @@ -1392,144 +1392,3 @@
𝕩 &xopf; &#x1D569; &#120169; MATHEMATICAL DOUBLE-STRUCK SMALL X
𝕪 &yopf; &#x1D56A; &#120170; MATHEMATICAL DOUBLE-STRUCK SMALL Y
𝕫 &zopf; &#x1D56B; &#120171; MATHEMATICAL DOUBLE-STRUCK SMALL Z
Α &Agr; &#x00391; &#913; GREEK CAPITAL LETTER ALPHA
Β &Bgr; &#x00392; &#914; GREEK CAPITAL LETTER BETA
Γ &Ggr; &#x00393; &#915; GREEK CAPITAL LETTER GAMMA
Δ &Dgr; &#x00394; &#916; GREEK CAPITAL LETTER DELTA
Ε &Egr; &#x00395; &#917; GREEK CAPITAL LETTER EPSILON
Ζ &Zgr; &#x00396; &#918; GREEK CAPITAL LETTER ZETA
Η &EEgr; &#x00397; &#919; GREEK CAPITAL LETTER ETA
Θ &THgr; &#x00398; &#920; GREEK CAPITAL LETTER THETA
Ι &Igr; &#x00399; &#921; GREEK CAPITAL LETTER IOTA
Κ &Kgr; &#x0039A; &#922; GREEK CAPITAL LETTER KAPPA
Λ &Lgr; &#x0039B; &#923; GREEK CAPITAL LETTER LAMDA
Μ &Mgr; &#x0039C; &#924; GREEK CAPITAL LETTER MU
Ν &Ngr; &#x0039D; &#925; GREEK CAPITAL LETTER NU
Ξ &Xgr; &#x0039E; &#926; GREEK CAPITAL LETTER XI
Ο &Ogr; &#x0039F; &#927; GREEK CAPITAL LETTER OMICRON
Π &Pgr; &#x003A0; &#928; GREEK CAPITAL LETTER PI
Ρ &Rgr; &#x003A1; &#929; GREEK CAPITAL LETTER RHO
Σ &Sgr; &#x003A3; &#931; GREEK CAPITAL LETTER SIGMA
Τ &Tgr; &#x003A4; &#932; GREEK CAPITAL LETTER TAU
Υ &Ugr; &#x003A5; &#933; GREEK CAPITAL LETTER UPSILON
Φ &PHgr; &#x003A6; &#934; GREEK CAPITAL LETTER PHI
Χ &KHgr; &#x003A7; &#935; GREEK CAPITAL LETTER CHI
Ψ &PSgr; &#x003A8; &#936; GREEK CAPITAL LETTER PSI
Ω &OHgr; &#x003A9; &#937; GREEK CAPITAL LETTER OMEGA
α &agr; &#x003B1; &#945; GREEK SMALL LETTER ALPHA
β &bgr; &#x003B2; &#946; GREEK SMALL LETTER BETA
γ &ggr; &#x003B3; &#947; GREEK SMALL LETTER GAMMA
δ &dgr; &#x003B4; &#948; GREEK SMALL LETTER DELTA
ε &egr; &#x003B5; &#949; GREEK SMALL LETTER EPSILON
ζ &zgr; &#x003B6; &#950; GREEK SMALL LETTER ZETA
η &eegr; &#x003B7; &#951; GREEK SMALL LETTER ETA
θ &thgr; &#x003B8; &#952; GREEK SMALL LETTER THETA
ι &igr; &#x003B9; &#953; GREEK SMALL LETTER IOTA
κ &kgr; &#x003BA; &#954; GREEK SMALL LETTER KAPPA
λ &lgr; &#x003BB; &#955; GREEK SMALL LETTER LAMDA
μ &mgr; &#x003BC; &#956; GREEK SMALL LETTER MU
ν &ngr; &#x003BD; &#957; GREEK SMALL LETTER NU
ξ &xgr; &#x003BE; &#958; GREEK SMALL LETTER XI
ο &ogr; &#x003BF; &#959; GREEK SMALL LETTER OMICRON
π &pgr; &#x003C0; &#960; GREEK SMALL LETTER PI
ρ &rgr; &#x003C1; &#961; GREEK SMALL LETTER RHO
ς &sfgr; &#x003C2; &#962; GREEK SMALL LETTER FINAL SIGMA
σ &sgr; &#x003C3; &#963; GREEK SMALL LETTER SIGMA
τ &tgr; &#x003C4; &#964; GREEK SMALL LETTER TAU
υ &ugr; &#x003C5; &#965; GREEK SMALL LETTER UPSILON
φ &phgr; &#x003C6; &#966; GREEK SMALL LETTER PHI
χ &khgr; &#x003C7; &#967; GREEK SMALL LETTER CHI
ψ &psgr; &#x003C8; &#968; GREEK SMALL LETTER PSI
ω &ohgr; &#x003C9; &#969; GREEK SMALL LETTER OMEGA
Ё &IOcy; &#x00401; &#1025; capital IO, Russian
Ђ &DJcy; &#x00402; &#1026; capital DJE, Serbian
Ѓ &GJcy; &#x00403; &#1027; capital GJE Macedonian
Є &Jukcy; &#x00404; &#1028; capital JE, Ukrainian
Ѕ &DScy; &#x00405; &#1029; capital DSE, Macedonian
І &Iukcy; &#x00406; &#1030; capital I, Ukrainian
Ї &YIcy; &#x00407; &#1031; capital YI, Ukrainian
Ј &Jsercy; &#x00408; &#1032; capital JE, Serbian
Љ &LJcy; &#x00409; &#1033; capital LJE, Serbian
Њ &NJcy; &#x0040A; &#1034; capital NJE, Serbian
Ћ &TSHcy; &#x0040B; &#1035; capital TSHE, Serbian
Ќ &KJcy; &#x0040C; &#1036; capital KJE, Macedonian
Ў &Ubrcy; &#x0040E; &#1038; capital U, Byelorussian
Џ &DZcy; &#x0040F; &#1039; capital dze, Serbian
А &Acy; &#x00410; &#1040; capital A, Cyrillic
Б &Bcy; &#x00411; &#1041; capital BE, Cyrillic
В &Vcy; &#x00412; &#1042; capital VE, Cyrillic
Г &Gcy; &#x00413; &#1043; capital GHE, Cyrillic
Д &Dcy; &#x00414; &#1044; capital DE, Cyrillic
Е &IEcy; &#x00415; &#1045; capital IE, Cyrillic
Ж &ZHcy; &#x00416; &#1046; capital ZHE, Cyrillic
З &Zcy; &#x00417; &#1047; capital ZE, Cyrillic
И &Icy; &#x00418; &#1048; capital I, Cyrillic
Й &Jcy; &#x00419; &#1049; capital short I, Cyrillic
К &Kcy; &#x0041A; &#1050; capital KA, Cyrillic
Л &Lcy; &#x0041B; &#1051; capital EL, Cyrillic
М &Mcy; &#x0041C; &#1052; capital EM, Cyrillic
Н &Ncy; &#x0041D; &#1053; capital EN, Cyrillic
О &Ocy; &#x0041E; &#1054; capital O, Cyrillic
П &Pcy; &#x0041F; &#1055; capital PE, Cyrillic
Р &Rcy; &#x00420; &#1056; capital ER, Cyrillic
С &Scy; &#x00421; &#1057; capital ES, Cyrillic
Т &Tcy; &#x00422; &#1058; capital TE, Cyrillic
У &Ucy; &#x00423; &#1059; capital U, Cyrillic
Ф &Fcy; &#x00424; &#1060; capital EF, Cyrillic
Х &KHcy; &#x00425; &#1061; capital HA, Cyrillic
Ц &TScy; &#x00426; &#1062; capital TSE, Cyrillic
Ч &CHcy; &#x00427; &#1063; capital CHE, Cyrillic
Ш &SHcy; &#x00428; &#1064; capital SHA, Cyrillic
Щ &SHCHcy; &#x00429; &#1065; capital SHCHA, Cyrillic
Ъ &HARDcy; &#x0042A; &#1066; capital HARD sign, Cyrillic
Ы &Ycy; &#x0042B; &#1067; capital YERU, Cyrillic
Ь &SOFTcy; &#x0042C; &#1068; capital SOFT sign, Cyrillic
Э &Ecy; &#x0042D; &#1069; capital E, Cyrillic
Ю &YUcy; &#x0042E; &#1070; capital YU, Cyrillic
Я &YAcy; &#x0042F; &#1071; capital YA, Cyrillic
а &acy; &#x00430; &#1072; small a, Cyrillic
б &bcy; &#x00431; &#1073; small be, Cyrillic
в &vcy; &#x00432; &#1074; small ve, Cyrillic
г &gcy; &#x00433; &#1075; small ghe, Cyrillic
д &dcy; &#x00434; &#1076; small de, Cyrillic
е &iecy; &#x00435; &#1077; small ie, Cyrillic
ж &zhcy; &#x00436; &#1078; small zhe, Cyrillic
з &zcy; &#x00437; &#1079; small ze, Cyrillic
и &icy; &#x00438; &#1080; small i, Cyrillic
й &jcy; &#x00439; &#1081; small short i, Cyrillic
к &kcy; &#x0043A; &#1082; small ka, Cyrillic
л &lcy; &#x0043B; &#1083; small el, Cyrillic
м &mcy; &#x0043C; &#1084; small em, Cyrillic
н &ncy; &#x0043D; &#1085; small en, Cyrillic
о &ocy; &#x0043E; &#1086; small o, Cyrillic
п &pcy; &#x0043F; &#1087; small pe, Cyrillic
р &rcy; &#x00440; &#1088; small er, Cyrillic
с &scy; &#x00441; &#1089; small es, Cyrillic
т &tcy; &#x00442; &#1090; small te, Cyrillic
у &ucy; &#x00443; &#1091; small u, Cyrillic
ф &fcy; &#x00444; &#1092; small ef, Cyrillic
х &khcy; &#x00445; &#1093; small ha, Cyrillic
ц &tscy; &#x00446; &#1094; small tse, Cyrillic
ч &chcy; &#x00447; &#1095; small che, Cyrillic
ш &shcy; &#x00448; &#1096; small sha, Cyrillic
щ &shchcy; &#x00449; &#1097; small shcha, Cyrillic
ъ &hardcy; &#x0044A; &#1098; small hard sign, Cyrillic
ы &ycy; &#x0044B; &#1099; small yeru, Cyrillic
ь &softcy; &#x0044C; &#1100; small soft sign, Cyrillic
э &ecy; &#x0044D; &#1101; small e, Cyrillic
ю &yucy; &#x0044E; &#1102; small yu, Cyrillic
я &yacy; &#x0044F; &#1103; small ya, Cyrillic
ё &iocy; &#x00451; &#1105; small io, Russian
ђ &djcy; &#x00452; &#1106; small dje, Serbian
ѓ &gjcy; &#x00453; &#1107; small gje, Macedonian
є &jukcy; &#x00454; &#1108; small je, Ukrainian
ѕ &dscy; &#x00455; &#1109; small dse, Macedonian
і &iukcy; &#x00456; &#1110; small i, Ukrainian
ї &yicy; &#x00457; &#1111; small yi, Ukrainian
ј &jsercy; &#x00458; &#1112; small je, Serbian
љ &ljcy; &#x00459; &#1113; small lje, Serbian
њ &njcy; &#x0045A; &#1114; small nje, Serbian
ћ &tshcy; &#x0045B; &#1115; small tshe, Serbian
ќ &kjcy; &#x0045C; &#1116; small kje Macedonian
ў &ubrcy; &#x0045E; &#1118; small u, Byelorussian
џ &dzcy; &#x0045F; &#1119; small dze, Serbian
19 changes: 13 additions & 6 deletions pyingest/parsers/entity_convert.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import re
import namedentities
from namedentities import named_entities
from pyingest.config import config

# Matches a single entity token in text: a named entity, a decimal character
# reference, or a hexadecimal character reference.  Entity names are
# case-sensitive and may contain upper-case letters (e.g. '&Tilde;'), so the
# named-entity branch must accept both cases; tokens that have no entry in
# the translation dictionary are simply left untouched by the converter.
re_ents = re.compile(r'&[A-Za-z0-9]+;|&#[0-9]{1,6};|&#x[0-9a-fA-F]{1,6};')


class EntityConverter(object):

Expand All @@ -11,9 +13,14 @@ def __init__(self):
self.ent_dict = config.ENTITY_DICTIONARY

def convert(self):
    """Translate the entities found in ``self.input_text``.

    The input is first passed through ``named_entities``.  Every distinct
    entity token matched by ``re_ents`` is then looked up in
    ``self.ent_dict``; when a mapping exists, all occurrences of that
    token are replaced by the mapped value.  Tokens without a mapping are
    left unchanged.

    The converted text is stored in ``self.output_text``; nothing is
    returned.
    """
    text = named_entities(self.input_text)

    # Only the entities actually present in the text are looked up, rather
    # than substituting every key of the (large) dictionary.  dict.fromkeys
    # removes duplicates while keeping first-seen order, so each entity is
    # processed exactly once.
    for entity in dict.fromkeys(re_ents.findall(text)):
        if entity not in self.ent_dict:
            # No translation known for this entity: keep it as-is.
            continue
        # Literal replacement: neither the entity nor its translation should
        # be interpreted as a regular expression / replacement template.
        text = text.replace(entity, self.ent_dict[entity])

    self.output_text = text

0 comments on commit e832127

Please sign in to comment.