From 40010607af1652fa884e5ebeeea686a52e74fbab Mon Sep 17 00:00:00 2001 From: ayalgelles Date: Sat, 20 Oct 2012 20:05:40 +0200 Subject: [PATCH 1/3] serve facebookexternalhit user agent with spiderable --- packages/spiderable/spiderable.js | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/packages/spiderable/spiderable.js b/packages/spiderable/spiderable.js index 7d97505e62..57ae6746eb 100644 --- a/packages/spiderable/spiderable.js +++ b/packages/spiderable/spiderable.js @@ -2,23 +2,22 @@ var fs = __meteor_bootstrap__.require('fs'); var spawn = __meteor_bootstrap__.require('child_process').spawn; var querystring = __meteor_bootstrap__.require('querystring'); + var urlParser = __meteor_bootstrap__.require('url'); var app = __meteor_bootstrap__.app; // how long to let phantomjs run before we kill it var REQUEST_TIMEOUT = 15*1000; app.use(function (req, res, next) { - if (/\?.*_escaped_fragment_=/.test(req.url)) { - // get escaped fragment out of the url. - var idx = req.url.indexOf('?'); - var preQuery = req.url.substr(0, idx); - var queryStr = req.url.substr(idx + 1); - var parsed = querystring.parse(queryStr); - delete parsed['_escaped_fragment_']; - var newQuery = querystring.stringify(parsed); - var newPath = preQuery + (newQuery ? "?" + newQuery : ""); - var url = "http://" + req.headers.host + newPath; - + if (/\?.*_escaped_fragment_=/.test(req.url) || req.headers['user-agent'].indexOf('facebookexternalhit') !== -1) { + // reassemblying url without escaped fragment if exists + var parsedUrl = urlParser.parse(req.url); + var parsedQuery = querystring.parse(parsedUrl.query); + delete parsedQuery['_escaped_fragment_']; + var newQuery = querystring.stringify(parsedQuery); + var newPath = parsedUrl.pathname + (newQuery ? ('?' + newQuery) : ''); + var url = "http://" + req.headers.host + newPath; + // run phantomjs // // Use '/dev/stdin' to avoid writing to a temporary file. Can't @@ -38,7 +37,7 @@ }); cp.on('exit', function (code) { - if (0 === code && //i.test(data)) { + if (0 === code && / Date: Sun, 28 Oct 2012 18:10:48 +0200 Subject: [PATCH 2/3] using a list of agents to serve --- packages/spiderable/spiderable.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/spiderable/spiderable.js b/packages/spiderable/spiderable.js index 57ae6746eb..0840c12f00 100644 --- a/packages/spiderable/spiderable.js +++ b/packages/spiderable/spiderable.js @@ -4,13 +4,14 @@ var querystring = __meteor_bootstrap__.require('querystring'); var urlParser = __meteor_bootstrap__.require('url'); var app = __meteor_bootstrap__.app; + var agents = ['facebookexternalhit']; // list of bot-agents to serve (possibly make this list configurable by user) // how long to let phantomjs run before we kill it var REQUEST_TIMEOUT = 15*1000; app.use(function (req, res, next) { - if (/\?.*_escaped_fragment_=/.test(req.url) || req.headers['user-agent'].indexOf('facebookexternalhit') !== -1) { - // reassemblying url without escaped fragment if exists + if (/\?.*_escaped_fragment_=/.test(req.url) || agents.indexOf(req.headers['user-agent']) !== -1) { + // reassembling url without escaped fragment if exists var parsedUrl = urlParser.parse(req.url); var parsedQuery = querystring.parse(parsedUrl.query); delete parsedQuery['_escaped_fragment_']; From cb0d7238a12b74f63c4aaf5844bcba54b62d2d72 Mon Sep 17 00:00:00 2001 From: Nick Martin Date: Mon, 29 Oct 2012 19:11:13 -0700 Subject: [PATCH 3/3] Convert to use regexps for user agents, so we can match multiple Facebook agents. --- packages/spiderable/spiderable.js | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/packages/spiderable/spiderable.js b/packages/spiderable/spiderable.js index 0840c12f00..30e81e28e6 100644 --- a/packages/spiderable/spiderable.js +++ b/packages/spiderable/spiderable.js @@ -4,21 +4,29 @@ var querystring = __meteor_bootstrap__.require('querystring'); var urlParser = __meteor_bootstrap__.require('url'); var app = __meteor_bootstrap__.app; - var agents = ['facebookexternalhit']; // list of bot-agents to serve (possibly make this list configurable by user) + + // list of bot user agents that we want to serve statically, but do + // not obey the _escaped_fragment_ protocol. The page is served + // statically to any client whos user agent matches any of these + // regexps. (possibly make this list configurable by user). + var AGENTS = [/^facebookexternalhit/]; // how long to let phantomjs run before we kill it var REQUEST_TIMEOUT = 15*1000; app.use(function (req, res, next) { - if (/\?.*_escaped_fragment_=/.test(req.url) || agents.indexOf(req.headers['user-agent']) !== -1) { + if (/\?.*_escaped_fragment_=/.test(req.url) || + _.any(AGENTS, function (re) { + return re.test(req.headers['user-agent']); })) { + // reassembling url without escaped fragment if exists var parsedUrl = urlParser.parse(req.url); var parsedQuery = querystring.parse(parsedUrl.query); delete parsedQuery['_escaped_fragment_']; var newQuery = querystring.stringify(parsedQuery); - var newPath = parsedUrl.pathname + (newQuery ? ('?' + newQuery) : ''); + var newPath = parsedUrl.pathname + (newQuery ? ('?' + newQuery) : ''); var url = "http://" + req.headers.host + newPath; - + // run phantomjs // // Use '/dev/stdin' to avoid writing to a temporary file. Can't