# FS - Facebook Spider
# V 0.2
# Software created by Sean Colyer, contributions from Joe Kubiak
# http://seancolyer.com
# sean@seancolyer.com
#
# Copyright (C) 2008-2009 Sean Colyer
#
# This package is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This package is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this package; if not, write to the Free Software
# Foundation, 51 Franklin Street, Fifth Floor, Boston, MA, 02110-1301
# USA
#Each response body is read once into a variable; calling response.read() repeatedly on the same response tended to lead to problems.
#The process is not threaded and runs through the pictures in the order they appear on Facebook, which is a more accurate representation of how a human would view the photos.
#Regular expressions could probably be used to search through the page more effectively.
#FIXME -- FB no longer uses many of the abbreviated subdomains i.e. uva.facebook.com , which should simplify a bit of the code
import os, urllib,urllib2,cookielib,getpass
# ---- User-adjustable settings ----

# Browser-like header sent with every request to avoid an obvious script appearance.
fakeHeader = {
    'User-agent': 'Mozilla/4.0 (compatible; MSIE 6.0; AOL 9.0; Windows NT 5.1)',
}

# 1 -> "Name FacebookID#" style directory (avoids clashes between people of
# the same name); anything else -> plain "Name" directory.
dirStyle = 1

# Where pictures are stored. If not empty it must end with '/'; empty means
# paths are relative to wherever the script is run from.
rootDir = ''

# View selectors handed to getPhotos.
profile = 'profile'
user = 'user'
allPics = 'all'
album = 'album'
def getID(homeUrl):
    """Fetch the home page and cut the logout token and user id out of it.

    homeUrl -- URL of the logged-in user's home page.

    Returns a (logoutID, fbid) tuple of strings. Both values are located by
    plain substring search, so they are only meaningful when the expected
    markers are present in the page.
    """
    request = urllib2.Request(homeUrl, None, fakeHeader)
    page = urllib2.urlopen(request).read()

    # New Facebook style: the id is whatever sits between the first
    # 'profile.php?id=' and the next '"><'. Caveat carried over from the
    # original author: anything on the home page that contains the same
    # marker earlier would win; searching for "ref=name" would be preferable.
    idMarker = 'profile.php?id='
    afterId = page[page.find(idMarker) + len(idMarker):-1]
    fbid = afterId[:afterId.find('"><')]

    # The logout token is the value of the h= parameter, up to the next '&'.
    tokenMarker = 'logout.php?h='
    afterToken = page[page.find(tokenMarker) + len(tokenMarker):-1]
    logoutID = afterToken[:afterToken.find('&')]

    print('%s %s' % (logoutID, fbid))
    return logoutID, fbid
def logout(cookieFile, logoutID):
    """Log out of Facebook and write the cookie jar to disk.

    cookieFile -- path the module-level cookieJar is saved to.
    logoutID   -- token appended to the logout URL as its h= value.
    """
    print(logoutID)
    request = urllib2.Request(
        'http://www.facebook.com/logout.php?h=' + logoutID, None, fakeHeader)
    urllib2.urlopen(request)
    # Persist the cookies only after the logout request has been made.
    cookieJar.save(cookieFile)
    print('You are now logged out.')
def getName():
    """Return the name shown in the title of the current profile page.

    Works for either the logged-in user or a searched user: the profile URL
    is built from the module-level shortUrl and fbid, fetched with
    fakeHeader, and the name is sliced out of the page's title element.
    """
    profileUrl = shortUrl + 'profile.php?id=' + fbid
    print(profileUrl)
    profileReq = urllib2.Request(profileUrl, None, fakeHeader)
    profilePage = urllib2.urlopen(profileReq).read()
    # Take everything from the opening title tag up to (not including) the
    # '/title>' of the closing tag.
    name = profilePage[profilePage.find('<title>'):profilePage.find('/title>')]
    # Skip the first 18 characters (the tag plus, presumably, a fixed
    # 'Facebook | ' style prefix -- confirm against a live page) and drop the
    # trailing '<' left over from the closing tag.
    name = name[18:-1]
    print(name)
    return name
# Walks the photo listing for one `view` of the profile identified by the
# module-level shortUrl/fbid and requests each photo page in listing order.
#   dirName   -- per-person directory name; rootDir is prepended to it below
#   view      -- compared against the module-level allPics / user / album / profile values
#   albumid   -- album id used to build the album.php URLs (album view)
#   albumName -- album title; '/' is replaced by '-' to form the sub-directory name
# Prints progress; returns None early when no numeric photo count can be parsed.
# NOTE(review): indentation is missing throughout this block, and the text is
# cut off partway through the statement that searches for 'id="myphotolink">'.
# The remainder of this function and the header of the album-listing code that
# follows are not present here -- restore from a clean copy before running.
# NOTE(review): `view is ...` tests object identity, not string equality, so a
# caller passing an equal but distinct string object would match no branch.
def getPhotos(dirName, view, albumid = -3, albumName = None):
#facebook uses only slight differences between different styles of photo, so a single method can be created for different types -- 2009/03 I think even closer together now
if view is allPics or view is user:
picBaseUrl = shortUrl + 'profile.php?id=' + fbid +'&v=photos'
#OLD STYLE: picBaseUrl = shortUrl + 'photo_search.php?id=' + fbid + '&view=' + view
lcdir = '/General Pictures'
if view is album:
# NOTE(review): albumid defaults to the int -3, so this concatenation needs a
# string albumid; the paging URL further down wraps it in str() instead.
picBaseUrl = shortUrl + 'album.php?aid=' + albumid + '&id=' + fbid
albumName = albumName.replace('/','-')
lcdir = '/' + albumName
if view is profile:
picBaseUrl = shortUrl + 'album.php?profile&id='+ fbid
lcdir = '/Profile Pictures'
#start the search
#print picBaseUrl
picBaseReq = urllib2.Request(picBaseUrl,None,fakeHeader)
picBaseResponse = urllib2.urlopen(picBaseReq)
picBasePage = picBaseResponse.read()
#print picBasePage
# The photo count is cut out of the text following '"summary">' (album and
# profile views) or 'class="caption">' (all and user views); picPerPage is the
# page size used by the paging logic in the loop below.
if view is profile or view is album:
numPics = picBasePage[picBasePage.find('"summary">')+14:-1]
numPics = numPics[0:numPics.find('<')]
numPics = (numPics.split())[-1]
if not numPics.isdigit(): #special case if there are fewer than 20, I believe, as there are not multiple pages
numPics = picBasePage[picBasePage.find('"summary">')+14:-1]
numPics = numPics[0:numPics.find(' photos')]
picPerPage = 20
if view is allPics or view is user:
numPics = picBasePage[picBasePage.find('class="caption">')+16:-1]
numPics = numPics[0:numPics.find(' photos')]
picPerPage = 15
if not numPics.isdigit():
print "No photos under the " + view + " view."
return
picBasePage = picBasePage[picBasePage.find('UIPhotoGrid_Container'):-1] #Ignore extra pics sent by FB
#Loop to get individual pictures
# temp is the loop counter; the statement that advances it for this loop is
# not part of the visible text.
temp=0
while temp< int(numPics):
print str(temp + 1) + ' / ' + str(numPics)
# At each page boundary the next listing page is fetched; the all/user views
# page by photo offset (&so=), the album and profile views by page number.
if temp > 0 and temp % picPerPage == 0:
#once again, slight variations for different styles of pictures.
if view is allPics or view is user:
#picBaseUrl = shortUrl + 'photo_search.php?page=' + str(int(temp/20)+1) + '&id=' + fbid + '&view=' + view
picBaseUrl = shortUrl + 'profile.php?id=' + fbid + '&v=photos&so=' + str(int(temp))
if view is album:
picBaseUrl = shortUrl + 'album.php?aid=' + str(albumid) + '&page=' + str(int(temp/picPerPage)+1) + '&id=' + fbid
#print picBaseUrl
if view is profile:
picBaseUrl = shortUrl + 'album.php?page=' + str(int(temp/picPerPage)+1) + '&aid=-3&id=' + fbid
#print picBaseUrl
picBaseReq = urllib2.Request(picBaseUrl,None,fakeHeader)
picBaseResponse = urllib2.urlopen(picBaseReq)
picBasePage = picBaseResponse.read()
#print picBaseUrl
picBasePage = picBasePage[picBasePage.find('UIPhotoGrid_Container'):-1]
# Advance to the next 'photo.php?pid=' link in the remaining listing text and
# stop when none is left.
# NOTE(review): the -1 result of find() is compared with `is` rather than `==`.
if picBasePage.find('photo.php?pid=') is -1:
break
picBasePage=picBasePage[picBasePage.find('photo.php?pid=')+14:-1]
picUrlEnd = picBasePage[0:picBasePage.find('"')] #has picid, subj and id which are now required
picid = picUrlEnd[0:picUrlEnd.find('&op')]
# NOTE(review): as written this replace is a no-op ('&' -> '&'); presumably it
# once unescaped an HTML entity such as '&amp;' -- confirm against a clean copy.
picUrlEnd = picUrlEnd.replace('&','&')
#Get and save photo
# NOTE(review): if this statement runs once per photo, a non-empty rootDir is
# prepended again on every pass -- confirm against a correctly indented copy.
dirName = rootDir + dirName #set active directory so it includes full path up to the user
if not os.path.isfile(dirName + lcdir + '/' + picid[0:8] + '.jpg'): #skips over pictures which have already been saved under this name to this directory, to speed up re-running
picUrl = shortUrl + 'photo.php?pid=' + picUrlEnd
#print picUrl
picRequest = urllib2.Request(picUrl,None,fakeHeader)
picResponse = urllib2.urlopen(picRequest)
picPage = picResponse.read()
#print picUrl
picPage = picPage[picPage.find('id="myphotolink">
')+10:-1]
# NOTE(review): from here on the code deals with albums rather than single
# photos. numAlbums and albumName are read below without any assignment in the
# visible text, which is consistent with the start of this code being lost.
numAlbums = int(numAlbums[0:numAlbums.find('Photo')])
temp = 0
#print albumUrl
while temp < numAlbums:
# After every 5 albums the next listing page is addressed via the &s= offset.
if temp > 0 and temp % 5 == 0:
albumUrl = shortUrl + 'photos.php?id=' + fbid + '&s=' + str(temp)
albumReq = urllib2.Request(albumUrl,None,fakeHeader)
albumResponse = urllib2.urlopen(albumReq)
albumPage = albumResponse.read()
# NOTE(review): both find('') arguments are empty strings, so as written this
# evaluates to albumPage[2:0]; the search markers were presumably lost -- confirm.
albumPage = albumPage[albumPage.find('')+2:albumPage.find('')]
albumid = albumPage[0:albumPage.find('&')]
print 'Acquiring: ' + albumName + ' album.'
# NOTE(review): passes the literal 'album' while getPhotos selects its branch
# with an identity test (`view is album`).
getPhotos(dirName, 'album', albumid, albumName)
temp+=1
def search():
    """Look a person up through Facebook's s.php and return the top hit.

    Reads the module-level homeUrl and searchTerm. Only the first result is
    considered, so this is a poor search method for common names.

    Returns a (searchUrl, searchid) tuple: the base URL cut out of the first
    result link (ending in '/') and the id found in that link.
    """
    # Strip the trailing 'home.php' (8 characters) to get the site base URL.
    # The term is percent-encoded with quote_plus so that characters such as
    # '&', '#' or '+' in a name cannot corrupt the query string; spaces still
    # become '+'. The query is kept in the URL rather than passed as request
    # data: the original author found that sending urlencode()d values did not
    # return a sid.
    searchUrl = homeUrl[0:-8] + 's.php?q=' + urllib.quote_plus(searchTerm)

    # First request: only used to obtain the sid that the result page expects.
    searchRequest = urllib2.Request(searchUrl, None, fakeHeader)
    searchPage = urllib2.urlopen(searchRequest).read()
    sid = searchPage[searchPage.find('&sid=') + 5:-1]
    sid = sid[0:sid.find('")')]

    # Second request: the same query with the sid attached.
    searchRequest = urllib2.Request(searchUrl + '&sid=' + sid, None, fakeHeader)
    searchPage = urllib2.urlopen(searchRequest).read()

    # Trim to the first result, then to what follows its URL-encoded 'http://'.
    searchPage = searchPage[searchPage.find('friend='):-1]
    searchUrl = searchPage[searchPage.find('http%3A%2F%2F') + 13:-1]
    # The id sits between the encoded '?id=' and the next encoded '&'.
    searchid = searchUrl[searchUrl.find('%3Fid%3D') + 8:searchUrl.find('%26')]
    # Everything before the encoded '/profile.php' is the host for this person.
    searchUrl = 'http://' + searchUrl[0:searchUrl.find('%2Fprofile.php')] + '/'
    return searchUrl, searchid
if __name__ == "__main__":
    # Session cookies are kept in an LWP cookie file that is reused if present.
    cookieFile = '.cookies.lwp'
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(cookieFile):
        cookieJar.load(cookieFile)
    # Install the jar into urllib2 so every urlopen() call uses it automatically.
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    urllib2.install_opener(opener)

    email = raw_input('What is your Facebook Email Address?')
    password = getpass.getpass('What is your Facebook password?')
    loginValues = {'email': email, 'pass': password}
    encodedLogin = urllib.urlencode(loginValues)
    # Need 2 requests to conform with POST requirements of Facebook:
    # a plain GET of the login page first, then the POST with the credentials.
    getRequest = urllib2.Request('https://login.facebook.com/login.php', None, fakeHeader)
    getResponse = urllib2.urlopen(getRequest)
    loginRequest = urllib2.Request('https://login.facebook.com/login.php', encodedLogin, fakeHeader)
    loginResponse = urllib2.urlopen(loginRequest)
    homePage = loginResponse.read()

    homeUrl = 'http://www.facebook.com/home.php'
    logoutID, fbid = getID(homeUrl)
    # Keep the logged-in user's id: fbid is overwritten by every search, and
    # choosing "0 for yourself" afterwards must not reuse the searched id.
    ownFbid = fbid
    print('ID: ' + fbid)

    while True:
        searchTerm = raw_input('Who would you like to search for? ( 0 for yourself , 1 to exit, type name of others )')
        # Integers are compared by value (==); identity (`is`) only happens
        # to work for small ints as an implementation detail.
        if searchTerm.isdigit() and int(searchTerm) == 1:
            # wishes to exit
            break
        if searchTerm.isdigit() and int(searchTerm) == 0:
            # searches for oneself
            shortUrl = homeUrl[0:-8]
            fbid = ownFbid
        else:
            # name entered to be searched; search() reads searchTerm itself
            shortUrl, fbid = search()

        name = getName()
        if dirStyle == 1:
            dirName = name + ' ' + fbid
        else:
            dirName = name

        print('Acquiring Profile pictures of ' + name + '.')
        getPhotos(dirName, profile)
        print('Acquiring pictures of ' + name + ' owned by others.')
        getPhotos(dirName, allPics)
        print('Acquiring pictures of ' + name + ' owned by them.')
        getPhotos(dirName, user)
        print('Acquiring Photo albums owned by ' + name)
        getPhotoAlbums(dirName)

    logout(cookieFile, logoutID)