/
LA_Times.js.in
79 lines (73 loc) · 2.67 KB
/
LA_Times.js.in
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
{
"translatorID" : "f0cc883d-602b-4ad1-b704-8b21a38158f2",
"translatorType" : 4,
"label" : "LA Times",
"creator" : "Erik Hetzner",
"target" : "^http://(www\\.|latimesblogs\\.|lakersblog\\.)?latimes\\.com/",
"minVersion" : "2.0",
"maxVersion" : "",
"priority" : 100,
"inRepository" : true,
"lastUpdated" : "2010-07-01T20:25:59-07:00"
}
//@framework@
/**
 * Translator detection entry point.
 *
 * Hands both arguments to the framework's FW.detectWeb unchanged and returns
 * whatever it produces.
 *
 * @param doc - passed straight through (presumably the page document)
 * @param url - passed straight through (presumably the page URL)
 * @returns the value returned by FW.detectWeb
 */
function detectWeb(doc, url) {
  var detected = FW.detectWeb(doc, url);
  return detected;
}
/**
 * Translator scraping entry point.
 *
 * Hands both arguments to the framework's FW.doWeb unchanged and returns
 * whatever it produces.
 *
 * @param doc - passed straight through (presumably the page document)
 * @param url - passed straight through (presumably the page URL)
 * @returns the value returned by FW.doWeb
 */
function doWeb(doc, url) {
  var outcome = FW.doWeb(doc, url);
  return outcome;
}
/**
 * Articles
 *
 * Registers a newspaperArticle scraper with the framework. Both detection
 * and the title read the <h1> directly inside div.story; the ISSN and
 * publication title are fixed strings.
 */
FW.Scraper({
  itemType         : 'newspaperArticle',
  detect           : FW.Xpath('//div[@class="story"]/h1').text(),
  title            : FW.Xpath('//div[@class="story"]/h1').text().
                       trim(),
  // The description <meta> lives under <html>/<head>. An absolute "/head"
  // step looks for a document element named "head" and so can never match
  // an HTML page; "//head" finds it wherever it sits.
  abstractNote     : FW.Xpath('//head/meta[@name="description"]/@content').text(),
  // Byline text: strip the leading "By ", then split on commas so each
  // piece is handled separately by the remaining filters.
  creators         : FW.Xpath('//div[@class="byline"]//span[contains(@class, "byline")]').text().
                       remove(/^By /).
                       split(/,/).
                       trim().
                       capitalizeTitle().
                       cleanAuthor("author"),
  date             : FW.Xpath('//div[@class="byline"]//span[@class="dateString"]').text(),
  ISSN             : "0458-3035",
  publicationTitle : "Los Angeles Times",
  section          : FW.Xpath('//span[@id="sectionBreadcrumb"]').text().
                       capitalizeTitle(),
  // Attach the current page itself (FW.Url()) as an HTML snapshot.
  attachments      : {
    url   : FW.Url(),
    type  : "text/html",
    title : "LA Times Snapshot"
  }
});
/**
 * Blog posts
 *
 * Registers a blogPost scraper, detected by the presence of
 * div#blog-header. No creators are extracted: the author markup on these
 * pages is too messy to parse usefully.
 */
FW.Scraper({
  itemType: 'blogPost',
  detect: FW.Xpath('//div[@id="blog-header"]'),
  title: FW.Xpath('//div[@class="entry"]/h1[@class="entry-header"]')
    .text()
    .trim(),
  // Keep only the leading part of the timestamp, up to and including the
  // last space-preceded four-digit run the pattern can reach.
  date: FW.Xpath('//div[@class="entry"]/div[@class="time"]')
    .text()
    .match(/^(.* [0-9]{4})/, 1),
  // The blog's own heading, prefixed so the publication reads as an
  // LA Times blog.
  publicationTitle: FW.Xpath('//div[@id="blog-header"]/h1')
    .text()
    .prepend("LA Times Blogs: "),
  attachments: {
    url: FW.Url(),
    type: "text/html",
    title: "LA Times Snapshot"
  }
});
/**
 * Search results
 *
 * Registers a multiple-item scraper for pages containing
 * ul.adv-results-list. Titles come from each <h3> in the list and URLs
 * from the href of the link inside it.
 *
 * Blog results cannot be retrieved: they are hosted on different domains
 * and JS gives us an error.
 */
FW.MultiScraper({
  itemType: "multiple",
  detect: FW.Xpath('//ul[@class="adv-results-list"]'),
  choices: {
    titles: FW.Xpath('//ul[@class="adv-results-list"]//h3')
      .text()
      .trim(),
    urls: FW.Xpath('//ul[@class="adv-results-list"]//h3/a')
      .key('href')
      .text()
  }
});