-
Notifications
You must be signed in to change notification settings - Fork 0
/
FeedMetaDataChecker.R
39 lines (32 loc) · 1.25 KB
/
FeedMetaDataChecker.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Feed metadata checker -- setup.
# Keep text columns as character vectors rather than factors (this is only
# the default from R 4.0 onwards).
options(stringsAsFactors = FALSE)
library(XML)
library(RCurl)

# urls.txt is read as a header-less, space-separated table with two columns:
# the feed URL and the title currently on record for it.
df <- read.csv(file = "urls.txt", header = FALSE, sep = " ")
names(df) <- c("url", "currentTitle")

# Result columns; they start out empty and are filled in per feed later on.
df[["feedTitle"]] <- ""
df[["feedDescription"]] <- ""
# For every feed URL in df: download the feed, parse it as XML and copy the
# channel title and description into df$feedTitle / df$feedDescription.
#
# A feed that cannot be downloaded, parsed or queried is reported with a
# warning and skipped, so that one bad URL does not abort the remaining rows;
# its feedTitle / feedDescription cells keep whatever value they already had.
for (i in seq_len(nrow(df))) {  # seq_len() is empty for 0 rows, unlike 1:0
  print(i)  # progress indicator
  xml.url <- df$url[i]

  meta <- tryCatch(
    {
      # NOTE(review): cookiefile = "nosuchfile" presumably just switches on
      # curl's cookie handling for redirect chains -- confirm.
      script <- getURL(
        xml.url,
        .opts = curlOptions(followlocation = TRUE, cookiefile = "nosuchfile"),
        httpheader = c("User-Agent" = "rss feed checker")
      )
      # asText = TRUE: always treat the downloaded body as XML content and
      # never let the parser interpret it as a file name or URL to open.
      doc <- xmlParse(script, asText = TRUE)

      # If a default namespace is specified, then we need to update the xpath
      # expressions to be able to handle it
      ns <- xmlNamespaceDefinitions(doc, simplify = TRUE)
      if (any(names(ns) %in% "")) {
        if ("rdf" %in% names(ns)) {
          # XPath cannot address an unnamed namespace, so give it a prefix.
          defaultNSLoc <- which(names(ns) == "")
          names(ns)[defaultNSLoc] <- "defaultNS"
          feedTitle <- getNodeSet(
            doc, "//rdf:RDF/defaultNS:channel/defaultNS:title",
            namespaces = ns
          )
          description <- getNodeSet(
            doc, "//rdf:RDF/defaultNS:channel/defaultNS:description",
            namespaces = ns
          )
        } else {
          # The queries above use the "rdf" prefix; evaluating them on a
          # document that does not declare it is an XPath error. Treat such
          # feeds as having no recognisable title/description instead.
          warning("Unrecognised feed format (default namespace without rdf): ",
                  xml.url, call. = FALSE, immediate. = TRUE)
          feedTitle <- list()
          description <- list()
        }
      } else {
        feedTitle <- getNodeSet(doc, "//rss/channel/title/text()")
        description <- getNodeSet(doc, "//channel/description/text()")
      }
      list(title = feedTitle, description = description)
    },
    error = function(e) {
      # immediate. = TRUE so the message appears next to the progress output
      # instead of being deferred until the whole loop has finished.
      warning("Skipping ", xml.url, ": ", conditionMessage(e),
              call. = FALSE, immediate. = TRUE)
      NULL
    }
  )
  if (is.null(meta)) {
    next
  }

  # Only the first matching node is used; rows without a match are left as is.
  if (length(meta$title) > 0) {
    titleValue <- xmlValue(meta$title[[1]])
    if (length(titleValue) == 1L) {
      df$feedTitle[i] <- titleValue
    }
  }
  if (length(meta$description) > 0) {
    descriptionValue <- xmlValue(meta$description[[1]])
    if (length(descriptionValue) == 1L) {
      df$feedDescription[i] <- descriptionValue
    }
  }
}