#!/usr/bin/env python3
"""SuicideGirls photo scraper.

Logs into the site with a headless Firefox (selenium/geckodriver),
scrolls the photo listing to discover album URLs, then downloads every
CDN-hosted image via requests.
"""

import configparser
import os
import re
import shutil
import time

import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Run the browser headless; set to False to watch the crawl for debugging.
options = Options()
options.headless = True

# Pick the geckodriver binary that matches the host OS.
if os.name == 'nt':
    ff_exec_path = "./geckodriver.exe"
else:
    # posix and any other platform fall back to the plain binary name.
    # (The original only handled 'nt' and 'posix', leaving ff_exec_path
    # undefined — a NameError on the next line — everywhere else.)
    ff_exec_path = "./geckodriver"

# Module-level browser session shared by every function below.
driver = webdriver.Firefox(executable_path=ff_exec_path, options=options)
def getcreds(config_path='sgspider.ini'):
    """Read login credentials from an INI configuration file.

    Args:
        config_path: path of the configuration file to read; defaults to
            'sgspider.ini' (the value the original hard-coded), so existing
            callers are unaffected.

    Returns:
        configparser.ConfigParser — expected to contain a [main] section
        with 'username' and 'password' keys, as consumed by login().
    """
    print("Reading configuration.")
    configuration = configparser.ConfigParser()
    # NOTE: ConfigParser.read() silently skips a missing file; login()
    # would then fail with a KeyError on ['main'].
    configuration.read(config_path)
    print("Finished reading configuration.")
    return configuration
def login(credentials):
    """Open the front page and authenticate with the stored credentials.

    Args:
        credentials: ConfigParser whose [main] section supplies
            'username' and 'password'.

    Side effects: drives the module-level selenium `driver`. Success is
    not verified — the caller simply proceeds after the submit click.
    """
    print("Loading front page and initiating login")
    driver.get("https://suicidegirls.com")
    time.sleep(1)

    # Open the login form.
    driver.find_element_by_id("login").click()
    time.sleep(1)

    username_field = driver.find_element_by_name("username")
    password_field = driver.find_element_by_name("password")

    # Clear any pre-filled content, then type the credentials.
    username_field.clear()
    password_field.clear()
    username_field.send_keys(credentials['main']['username'])
    password_field.send_keys(credentials['main']['password'])
    time.sleep(1)

    # Submit the form.
    driver.find_element_by_xpath("//button[@class = 'button call-to-action']").click()
    print("Login finished, but unverified")
    time.sleep(1)
def getgirls():
    """Scroll the photo listing to the bottom and collect unique album URLs.

    Repeatedly clicks the 'load-more' link until it fails 10 times in a
    row (taken to mean the end of the listing), then harvests every href
    on the page and filters it down to album links.

    Returns:
        list[str]: de-duplicated album page URLs, in discovery order.
    """
    print("Loading photos page.")
    driver.get("https://www.suicidegirls.com/photos/sg/recent/all/")
    print("Finished loading photos page.")
    time.sleep(1)

    print("Starting to scroll through photos page.. this will take a *REALLY* LONG time!")
    print("Each '.' in the progress output represents a new page that has been loaded and 'x' is a failure to load the next page.")
    print("Please be cautious of memory usage!\n\n")
    print("Progress [", end='', flush=True)

    failure_streak = 0  # consecutive failed attempts to load the next page
    albumctr = 0
    done = False
    while not done:
        albumctr = albumctr + 1
        try:
            driver.find_element_by_xpath("//a[@id = 'load-more']").click()
            print('.', end='', flush=True)
            failure_streak = 0
        # Narrowed from a bare `except:` so Ctrl-C (KeyboardInterrupt)
        # can still abort this very long loop.
        except Exception:
            print('x', end='', flush=True)
            failure_streak = failure_streak + 1
            time.sleep(10)
            # Give up after 10 consecutive failures.
            if failure_streak >= 10:
                done = True
    print("]\n")
    print("Total albums found: " + str(albumctr))

    print("Collecting the URLs for each album. This will take a LONG time!")
    urls = [elem.get_attribute("href")
            for elem in driver.find_elements_by_xpath("//a[@href]")]

    # Keep only album links, dropping members-area, mailto, twitter and
    # comment anchors; preserve first-seen order while de-duplicating.
    girls = []
    for girl in urls:
        if ("https" in girl and "album" in girl
                and "data-comment" not in girl and "members" not in girl
                and "mailto" not in girl and "twitter.com" not in girl
                and girl not in girls):
            girls.append(girl)
    return girls
def getimgs(girls):
    """Visit each album page and download every cloudfront-hosted image.

    Args:
        girls: list of album page URLs (as returned by getgirls()).

    Side effects: navigates the module-level selenium `driver`, calls
    dlimgs() per image, and calls cleanup() once all albums are done.
    """
    print("collecting the URLs for the images. This will take a LONG time.")
    for girl in girls:
        driver.get(girl)
        urls = [elem.get_attribute("href")
                for elem in driver.find_elements_by_xpath("//a[@href]")]

        # Derive the model name and album title from the URL, which looks
        # like: https://www.suicidegirls.com/girls/<name>/album/<id>/<album>/
        name = girl
        name = name.replace('https://www.suicidegirls.com/girls/', '')
        name = re.sub('/album(.*)', '', name)
        album = girl
        # re.escape: the name is literal data, not a regex pattern (the
        # original broke on names containing metacharacters like '.' or '+').
        album = re.sub(re.escape(name), '', album)
        album = album.replace('https://www.suicidegirls.com/girls/', '')
        album = re.sub('/album(.*)[0-9]/', '', album)
        album = re.sub('/', '', album)

        for img in urls:
            if "cloudfront" in img:
                dlimgs(name, album, img)

    # If we reach this we have looped through all the albums, so let's clean things up
    cleanup()
def dlimgs(girl, album, url):
    """Download one image into ./suicidegirls/<girl>/<album>/.

    Skips the download when the target file already exists, making the
    crawl resumable.

    Args:
        girl:  model name (used as a directory component).
        album: album title (used as a directory component).
        url:   direct image URL to fetch.
    """
    path = os.path.join("./suicidegirls", girl)
    path = os.path.join(path, album)
    os.makedirs(path, exist_ok=True)

    # The on-disk filename is the last path component of the URL.
    # (The original joined `path` into the regex input first, which the
    # greedy '(.*)/' stripped right back out — the join was redundant.)
    filename = os.path.join(path, re.sub('(.*)/', "", url))

    print("Looking at: " + str(url))
    if os.path.exists(filename):
        print("File: " + str(filename) + " already downloaded, skipping!")
        return
    print("File: " + str(filename) + " not downloaded, downloading now!")

    # Stream the body straight to disk. The with-blocks guarantee the
    # connection and file are released even if the copy raises — the
    # original only `del`eted the response on the success path.
    with requests.get(url, stream=True) as response:
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
def cleanup():
    """Shut down the browser session and terminate the script."""
    driver.quit()
    # `quit()` is the interactive-interpreter helper (site.Quitter);
    # raising SystemExit directly is the proper way to end a script.
    raise SystemExit
def main():
    """Entry point: authenticate, then crawl every album for images."""
    credentials = getcreds()
    login(credentials)
    albums = getgirls()
    getimgs(albums)


if __name__ == '__main__':
    main()