#!/usr/bin/env python3
"""SuicideGirls photo scraper.

Logs into the site with a headless Firefox (selenium/geckodriver),
scrolls the photo listing to discover album URLs, then downloads every
CDN-hosted image via requests.
"""

import configparser
import os
import re
import shutil
import time

import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Run the browser headless; set to False to watch the crawl for debugging.
options = Options()
options.headless = True

# Pick the geckodriver binary that matches the host OS.
if os.name == 'nt':
    ff_exec_path = "./geckodriver.exe"
else:
    # posix and any other platform fall back to the plain binary name.
    # (The original only handled 'nt' and 'posix', leaving ff_exec_path
    # undefined — a NameError on the next line — everywhere else.)
    ff_exec_path = "./geckodriver"

# Module-level browser session shared by every function below.
driver = webdriver.Firefox(executable_path=ff_exec_path, options=options)
def getcreds(config_path='sgspider.ini'):
    """Read login credentials from an INI configuration file.

    Args:
        config_path: path of the configuration file to read; defaults to
            'sgspider.ini' (the value the original hard-coded), so existing
            callers are unaffected.

    Returns:
        configparser.ConfigParser — expected to contain a [main] section
        with 'username' and 'password' keys, as consumed by login().
    """
    print("Reading configuration.")
    configuration = configparser.ConfigParser()
    # NOTE: ConfigParser.read() silently skips a missing file; login()
    # would then fail with a KeyError on ['main'].
    configuration.read(config_path)
    print("Finished reading configuration.")
    return configuration
def login(credentials):
    """Open the front page and authenticate with the stored credentials.

    Args:
        credentials: ConfigParser whose [main] section supplies
            'username' and 'password'.

    Side effects: drives the module-level selenium `driver`. Success is
    not verified — the caller simply proceeds after the submit click.
    """
    print("Loading front page and initiating login")
    driver.get("https://suicidegirls.com")
    time.sleep(1)

    # Open the login form.
    driver.find_element_by_id("login").click()
    time.sleep(1)

    username_field = driver.find_element_by_name("username")
    password_field = driver.find_element_by_name("password")

    # Clear any pre-filled content, then type the credentials.
    username_field.clear()
    password_field.clear()
    username_field.send_keys(credentials['main']['username'])
    password_field.send_keys(credentials['main']['password'])
    time.sleep(1)

    # Submit the form.
    driver.find_element_by_xpath("//button[@class = 'button call-to-action']").click()
    print("Login finished, but unverified")
    time.sleep(1)
def getgirls():
    """Scroll the photo listing to the bottom and collect unique album URLs.

    Repeatedly clicks the 'load-more' link until it fails 10 times in a
    row (taken to mean the end of the listing), then harvests every href
    on the page and filters it down to album links.

    Returns:
        list[str]: de-duplicated album page URLs, in discovery order.
    """
    print("Loading photos page.")
    driver.get("https://www.suicidegirls.com/photos/sg/recent/all/")
    print("Finished loading photos page.")
    time.sleep(1)

    print("Starting to scroll through photos page.. this will take a *REALLY* LONG time!")
    print("Each '.' in the progress output represents a new page that has been loaded and 'x' is a failure to load the next page.")
    print("Please be cautious of memory usage!\n\n")
    print("Progress [", end='', flush=True)

    failure_streak = 0  # consecutive failed attempts to load the next page
    albumctr = 0
    done = False
    while not done:
        albumctr = albumctr + 1
        try:
            driver.find_element_by_xpath("//a[@id = 'load-more']").click()
            print('.', end='', flush=True)
            failure_streak = 0
        # Narrowed from a bare `except:` so Ctrl-C (KeyboardInterrupt)
        # can still abort this very long loop.
        except Exception:
            print('x', end='', flush=True)
            failure_streak = failure_streak + 1
            time.sleep(10)
            # Give up after 10 consecutive failures.
            if failure_streak >= 10:
                done = True
    print("]\n")
    print("Total albums found: " + str(albumctr))

    print("Collecting the URLs for each album. This will take a LONG time!")
    urls = [elem.get_attribute("href")
            for elem in driver.find_elements_by_xpath("//a[@href]")]

    # Keep only album links, dropping members-area, mailto, twitter and
    # comment anchors; preserve first-seen order while de-duplicating.
    girls = []
    for girl in urls:
        if ("https" in girl and "album" in girl
                and "data-comment" not in girl and "members" not in girl
                and "mailto" not in girl and "twitter.com" not in girl
                and girl not in girls):
            girls.append(girl)
    return girls
def getimgs(girls):
    """Visit each album page and download every cloudfront-hosted image.

    Args:
        girls: list of album page URLs (as returned by getgirls()).

    Side effects: navigates the module-level selenium `driver`, calls
    dlimgs() per image, and calls cleanup() once all albums are done.
    """
    print("collecting the URLs for the images. This will take a LONG time.")
    for girl in girls:
        driver.get(girl)
        urls = [elem.get_attribute("href")
                for elem in driver.find_elements_by_xpath("//a[@href]")]

        # Derive the model name and album title from the URL, which looks
        # like: https://www.suicidegirls.com/girls/<name>/album/<id>/<album>/
        name = girl
        name = name.replace('https://www.suicidegirls.com/girls/', '')
        name = re.sub('/album(.*)', '', name)
        album = girl
        # re.escape: the name is literal data, not a regex pattern (the
        # original broke on names containing metacharacters like '.' or '+').
        album = re.sub(re.escape(name), '', album)
        album = album.replace('https://www.suicidegirls.com/girls/', '')
        album = re.sub('/album(.*)[0-9]/', '', album)
        album = re.sub('/', '', album)

        for img in urls:
            if "cloudfront" in img:
                dlimgs(name, album, img)

    # If we reach this we have looped through all the albums, so let's clean things up
    cleanup()
def dlimgs(girl, album, url):
    """Download one image into ./suicidegirls/<girl>/<album>/.

    Skips the download when the target file already exists, making the
    crawl resumable.

    Args:
        girl:  model name (used as a directory component).
        album: album title (used as a directory component).
        url:   direct image URL to fetch.
    """
    path = os.path.join("./suicidegirls", girl)
    path = os.path.join(path, album)
    os.makedirs(path, exist_ok=True)

    # The on-disk filename is the last path component of the URL.
    # (The original joined `path` into the regex input first, which the
    # greedy '(.*)/' stripped right back out — the join was redundant.)
    filename = os.path.join(path, re.sub('(.*)/', "", url))

    print("Looking at: " + str(url))
    if os.path.exists(filename):
        print("File: " + str(filename) + " already downloaded, skipping!")
        return
    print("File: " + str(filename) + " not downloaded, downloading now!")

    # Stream the body straight to disk. The with-blocks guarantee the
    # connection and file are released even if the copy raises — the
    # original only `del`eted the response on the success path.
    with requests.get(url, stream=True) as response:
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
def cleanup():
    """Shut down the browser session and terminate the script."""
    driver.quit()
    # `quit()` is the interactive-interpreter helper (site.Quitter);
    # raising SystemExit directly is the proper way to end a script.
    raise SystemExit
def main():
    """Entry point: authenticate, then crawl every album for images."""
    credentials = getcreds()
    login(credentials)
    albums = getgirls()
    getimgs(albums)


if __name__ == '__main__':
    main()