# FS - Facebook Spider
# V 0.2
# Software created by Sean Colyer, contributions from Joe Kubiak
# http://seancolyer.com
# sean@seancolyer.com
#
# Copyright (C) 2008-2009 Sean Colyer
#
# This package is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This package is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this package; if not, write to the Free Software
# Foundation, 51 Franklin Street, Fifth Floor, Boston, MA, 02110-1301
# USA

# For some reason creating response.read() variables seemed needed, or else
# accessing response.read() multiple times tended to lead to problems.
# The process is not threaded and runs through the pictures in the order they
# are on facebook. This is a more accurate representation of how a human
# would view the photos.
# Regular expressions could probably be used to search through the page more
# effectively.
# FIXME -- FB no longer uses many of the abbreviated subdomains,
# i.e. uva.facebook.com, which should simplify a bit of the code.
#
# NOTE: this is a Python 2 script (urllib2, cookielib, raw_input).

import os
import urllib
import urllib2
import cookielib
import getpass

# Change these settings below as desired.
# Use a mozilla header to avoid obvious script appearance.
fakeHeader = {'User-agent' : 'Mozilla/4.0 (compatible; MSIE 6.0; AOL 9.0; Windows NT 5.1)'}
# 1 for "Name FacebookID#" style directory, anything else for just "Name"
# directory. Style 1 avoids clashes between people of the same name.
dirStyle = 1
# If not empty, needs to end with /. Defaults to storing pictures wherever
# this script is run from.
rootDir = ''

# Names of the photo "views" understood by getPhotos().
profile = 'profile'
user = 'user'
allPics = 'all'
album = 'album'


def _fetch(url, data=None):
    """Return the body of url, requested with the spoofed browser header.

    The body is read into a string exactly once and the response is closed
    afterwards, so callers never touch the response object a second time.
    """
    response = urllib2.urlopen(urllib2.Request(url, data, fakeHeader))
    try:
        return response.read()
    finally:
        response.close()


def getID(homeUrl):
    """Acquire the user's ID based on their homepage.

    Returns a (logoutID, fbid) tuple of strings scraped from the page at
    homeUrl: the token that follows 'logout.php?h=' and the id that follows
    the first 'profile.php?id='.
    """
    homePage = _fetch(homeUrl)
    # New Facebook Style:
    fbid = homePage[homePage.find('profile.php?id=') + 15:-1]
    # I don't like this because I think it is possible that someone adds
    # something to their homepage that would get the first hit. Prefer to
    # search for "ref=name".
    fbid = fbid[0:fbid.find('"><')]
    logoutID = homePage[homePage.find('logout.php?h=') + 13:-1]
    logoutID = logoutID[0:logoutID.find('&')]
    print(logoutID + ' ' + fbid)
    return logoutID, fbid


def logout(cookieFile, logoutID):
    """Cleanly log out of facebook and save the cookie jar to cookieFile.

    Relies on the module-level cookieJar created in the __main__ section.
    """
    print(logoutID)
    logoutBase = 'http://www.facebook.com/logout.php?h='
    logoutRequest = urllib2.Request(logoutBase + logoutID, None, fakeHeader)
    urllib2.urlopen(logoutRequest)
    cookieJar.save(cookieFile)
    print('You are now logged out.')


def getName():
    """Acquire the name from someone's profile (end user or searched user).

    Reads the module-level shortUrl and fbid and returns the name taken
    from the profile page's <title> element.
    """
    profileUrl = shortUrl + 'profile.php?id=' + fbid
    print(profileUrl)
    profilePage = _fetch(profileUrl)
    # The start marker had been reduced to an empty string, which made find()
    # return 0 and the "name" the start of the whole page. The 18-character
    # skip below equals len('<title>Facebook | '), so the title tag is
    # presumably what was meant -- NOTE(review): confirm against a live page.
    name = profilePage[profilePage.find('<title>'):profilePage.find('/title>')]
    # Drop the 18-character prefix and the '<' that precedes '/title>'.
    name = name[18:-1]
    print(name)
    return name


def getPhotos(dirName, view, albumid = -3, albumName = None):
    """Download every photo of one view into dirName/<view directory>.

    Facebook uses only slight differences between the different styles of
    photo listing, so a single function handles all of them.

    dirName   -- per-person directory, relative to rootDir.
    view      -- one of the module constants profile, user, allPics, album.
    albumid   -- album id, used only for the album view.
    albumName -- album title, used only for the album view; '/' is replaced
                 by '-' so it can be used as a directory name.

    Reads the module-level shortUrl, fbid and rootDir. Pictures whose target
    file already exists are skipped, to speed up re-running.
    Raises ValueError for an unknown view.
    """
    # Compare by value: identity ("is") only worked through string interning.
    isGeneral = view == allPics or view == user
    if isGeneral:
        picBaseUrl = shortUrl + 'profile.php?id=' + fbid + '&v=photos'
        # OLD STYLE: picBaseUrl = shortUrl + 'photo_search.php?id=' + fbid + '&view=' + view
        lcdir = '/General Pictures'
    elif view == album:
        picBaseUrl = shortUrl + 'album.php?aid=' + str(albumid) + '&id=' + fbid
        albumName = albumName.replace('/', '-')
        lcdir = '/' + albumName
    elif view == profile:
        picBaseUrl = shortUrl + 'album.php?profile&id=' + fbid
        lcdir = '/Profile Pictures'
    else:
        raise ValueError('unknown photo view: ' + repr(view))

    # Start the search.
    picBasePage = _fetch(picBaseUrl)

    if isGeneral:
        numPics = picBasePage[picBasePage.find('class="caption">') + 16:-1]
        numPics = numPics[0:numPics.find(' photos')]
        picPerPage = 15
    else:
        summary = picBasePage[picBasePage.find('"summary"><h4>') + 14:-1]
        words = summary[0:summary.find('<')].split()
        # Guard the empty case so a missing summary is reported as
        # "no photos" below instead of raising IndexError.
        numPics = words[-1] if words else ''
        if not numPics.isdigit():
            # Special case if there are fewer than 20, I believe, as there
            # are not multiple pages.
            numPics = summary[0:summary.find(' photos')]
        picPerPage = 20

    if not numPics.isdigit():
        print("No photos under the " + view + " view.")
        return
    totalPics = int(numPics)

    # Ignore extra pics sent by FB.
    picBasePage = picBasePage[picBasePage.find('UIPhotoGrid_Container'):-1]

    # Set the active directory so it includes the full path up to the user.
    # This must happen once, outside the loop: doing it per picture would
    # prepend rootDir again on every iteration.
    dirName = rootDir + dirName
    targetDir = dirName + lcdir

    # Loop to get individual pictures.
    temp = 0
    while temp < totalPics:
        print(str(temp + 1) + ' / ' + str(numPics))
        if temp > 0 and temp % picPerPage == 0:
            # Reached the end of a listing page: fetch the next one. Once
            # again, slight variations for different styles of pictures.
            if isGeneral:
                # OLD STYLE: picBaseUrl = shortUrl + 'photo_search.php?page=' + str(int(temp/20)+1) + '&id=' + fbid + '&view=' + view
                picBaseUrl = shortUrl + 'profile.php?id=' + fbid + '&v=photos&so=' + str(temp)
            elif view == album:
                picBaseUrl = shortUrl + 'album.php?aid=' + str(albumid) + '&page=' + str(temp // picPerPage + 1) + '&id=' + fbid
            else:
                picBaseUrl = shortUrl + 'album.php?page=' + str(temp // picPerPage + 1) + '&aid=-3&id=' + fbid
            picBasePage = _fetch(picBaseUrl)
            picBasePage = picBasePage[picBasePage.find('UIPhotoGrid_Container'):-1]

        if picBasePage.find('photo.php?pid=') == -1:
            break
        # Consume the listing up to (and including) the next photo link.
        picBasePage = picBasePage[picBasePage.find('photo.php?pid=') + 14:-1]
        # Has picid, subj and id which are now required.
        picUrlEnd = picBasePage[0:picBasePage.find('"')]
        picid = picUrlEnd[0:picUrlEnd.find('&op')]
        # Links inside HTML attributes may carry entity-encoded ampersands;
        # decode them before requesting the URL (a no-op when absent).
        picUrlEnd = picUrlEnd.replace('&amp;', '&')

        # Get and save the photo. Skip pictures which have already been saved
        # under this name to this directory.
        targetFile = targetDir + '/' + picid[0:8] + '.jpg'
        if not os.path.isfile(targetFile):
            picPage = _fetch(shortUrl + 'photo.php?pid=' + picUrlEnd)
            imageUrl = picPage[picPage.find('id="myphotolink"><img src="'):-1]
            imageUrl = imageUrl[imageUrl.find('http://photos'):-1]
            imageUrl = imageUrl[0:imageUrl.find('"')]

            # Create the user and user/category directories if needed.
            if not os.path.isdir(targetDir):
                os.makedirs(targetDir)

            # Download before opening the file: a failed download must not
            # leave an empty .jpg behind, because the isfile() check above
            # would then skip that picture on every later run.
            imageData = _fetch(imageUrl)
            picout = open(targetFile, "wb")
            try:
                picout.write(imageData)
            finally:
                picout.close()
        temp += 1


def getPhotoAlbums(dirName):
    """Download every photo album owned by the current fbid.

    Each album is stored through getPhotos() in a sub-directory of dirName
    named after the album. Reads the module-level shortUrl and fbid.
    """
    albumUrl = shortUrl + 'photos.php?id=' + fbid + '&ref=pb'
    albumPage = _fetch(albumUrl)
    if albumPage.find('No Photo Albums') != -1:
        return

    countText = albumPage[albumPage.find('"summary">') + 10:-1]
    countText = countText[0:countText.find('Photo')].strip()
    if not countText.isdigit():
        # The summary did not have the expected "<n> Photo Albums" shape.
        print('Could not determine the number of photo albums.')
        return
    numAlbums = int(countText)

    albumMarker = '<h2><a href="album.php?aid='
    temp = 0
    while temp < numAlbums:
        if temp > 0 and temp % 5 == 0:
            # Listing pages are fetched in steps of five albums.
            albumUrl = shortUrl + 'photos.php?id=' + fbid + '&s=' + str(temp)
            albumPage = _fetch(albumUrl)
        markerPos = albumPage.find(albumMarker)
        if markerPos == -1:
            # No further album links on this page; stop instead of parsing
            # an arbitrary slice of the page.
            break
        albumPage = albumPage[markerPos + len(albumMarker):-1]
        albumName = albumPage[albumPage.find('">') + 2:albumPage.find('</a>')]
        albumid = albumPage[0:albumPage.find('&')]
        print('Acquiring: ' + albumName + ' album.')
        getPhotos(dirName, album, albumid, albumName)
        temp += 1


def search():
    """Search facebook's s.php for the module-level searchTerm.

    For the current time it only takes the top hit -- POOR search method.
    Returns (shortUrl, fbid) for that hit: the base URL of the hit's home
    network and the hit's facebook id, both as strings.
    """
    # quote_plus turns spaces into '+' like the previous hand-rolled replace
    # and also escapes characters such as '&' or '#' in the typed name.
    # (urllib.urlencode was reported not to work here: sid not returned.)
    searchUrl = homeUrl[0:-8] + 's.php?q=' + urllib.quote_plus(searchTerm)
    searchPage = _fetch(searchUrl)
    sid = searchPage[searchPage.find('&sid=') + 5:-1]
    sid = sid[0:sid.find('")')]

    # Repeat the query with the sid obtained from the first response.
    searchPage = _fetch(searchUrl + '&sid=' + sid)
    searchPage = searchPage[searchPage.find('friend='):-1]
    # Trims the search page to everything after the first search hit.
    searchUrl = searchPage[searchPage.find('http%3A%2F%2F') + 13:-1]
    # Finds the FB id #.
    searchid = searchUrl[searchUrl.find('%3Fid%3D') + 8:searchUrl.find('%26')]
    # Finds the url with the home network of the person.
    searchUrl = 'http://' + searchUrl[0:searchUrl.find('%2Fprofile.php')] + '/'
    return searchUrl, searchid


if __name__ == "__main__":
    # Creates (or re-uses, if already created) the .cookies.lwp file to store
    # cookie information.
    cookieFile = '.cookies.lwp'
    cookieJar = cookielib.LWPCookieJar()
    if os.path.isfile(cookieFile):
        cookieJar.load(cookieFile)
    # Installs cookieJar into urllib2 for automated use in accessing FB.
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
    urllib2.install_opener(opener)

    email = raw_input('What is your Facebook Email Address?')
    password = getpass.getpass('What is your Facebook password?')
    loginValues = {'email' : email, 'pass' : password }
    encodedLogin = urllib.urlencode(loginValues)
    # Need 2 requests to conform with POST requirements of Facebook.
    getRequest = urllib2.Request('https://login.facebook.com/login.php', None, fakeHeader)
    getResponse = urllib2.urlopen(getRequest)
    loginRequest = urllib2.Request('https://login.facebook.com/login.php', encodedLogin, fakeHeader)
    loginResponse = urllib2.urlopen(loginRequest)
    homePage = loginResponse.read()

    homeUrl = 'http://www.facebook.com/home.php'
    # OLD STYLE: homeUrl = homePage[homePage.find('\'')+1:homePage.rfind('\'')]
    # (http://__USERS-NETWORK__.facebook.com/home.php?)
    logoutID, fbid = getID(homeUrl)
    print('ID: ' + fbid)
    # Keep the logged-in user's id: fbid is overwritten by every search, and
    # choosing "0" afterwards must go back to the user's own profile.
    ownFbid = fbid

    try:
        while True:
            searchTerm = raw_input('Who would you like to search for? ( 0 for yourself , 1 to exit, type name of others )')
            isNumber = searchTerm.isdigit()
            # Wishes to exit.
            if isNumber and int(searchTerm) == 1:
                break
            if isNumber and int(searchTerm) == 0:
                # Searches for oneself.
                shortUrl = homeUrl[0:-8]
                fbid = ownFbid
            else:
                # Name entered to be searched.
                shortUrl, fbid = search()

            name = getName()
            if dirStyle == 1:
                dirName = name + ' ' + fbid
            else:
                dirName = name

            print('Acquiring Profile pictures of ' + name + '.')
            getPhotos(dirName, profile)
            print('Acquiring pictures of ' + name + ' owned by others.')
            getPhotos(dirName, allPics)
            print('Acquiring pictures of ' + name + ' owned by them.')
            getPhotos(dirName, user)
            print('Acquiring Photo albums owned by ' + name)
            getPhotoAlbums(dirName)
    finally:
        # Always end the session and persist cookies, even after an error.
        logout(cookieFile, logoutID)