![]() |
![]() |
![]() |
||||
Welcome to the GoFuckYourself.com - Adult Webmaster Forum forums. You are currently viewing our boards as a guest which gives you limited access to view most discussions and access our other features. By joining our free community you will have access to post topics, communicate privately with other members (PM), respond to polls, upload content and access many other special features. Registration is fast, simple and absolutely free so please, join our community today! If you have any problems with the registration process or your account login, please contact us. |
![]() ![]() |
|
Discuss what's fucking going on, and which programs are best and worst. One-time "program" announcements from "established" webmasters are allowed. |
|
Thread Tools |
![]() |
#1 |
Confirmed User
Industry Role:
Join Date: Sep 2015
Posts: 1,045
|
Best way to scrape this site?
Best way to scrape this site?
https://bitinfocharts.com/top-100-ri...addresses.html I tried to scrape with beautiful soup. I got a connection refused. I may copy site local, then scrape using beautiful soup. I want to keep an updated list of 10,000 bitcoin addresses. I manually entered the top 600 bitcoin addresses into a text file and uploaded to mysql. I have my bitcoin collider working now. I decided that instead of searching every bitcoin address in the blockchain I want to search a smaller database locally. This coding is so much fun. Code:
#!/usr/bin/env python
# Joric/bitcoin-dev, june 2012, public domain
#
# Brute-force "bitcoin collider": derives Bitcoin addresses from sequential
# 256-bit private keys using OpenSSL (via ctypes) and checks each derived
# address against a local MySQL table of known addresses. Progress is
# persisted to lastsearch.txt / top100counter.txt so a run can resume.
import hashlib
import time
import ctypes
import ctypes.util
import sys
import MySQLdb

# Load the OpenSSL crypto library; 'libeay32' is the legacy Windows DLL name.
ssl = ctypes.cdll.LoadLibrary (ctypes.util.find_library ('ssl') or 'libeay32')

def check_result (val, func, args):
    # ctypes errcheck hook: a NULL (0) pointer returned by OpenSSL is an error.
    if val == 0: raise ValueError
    else: return ctypes.c_void_p (val)

ssl.EC_KEY_new_by_curve_name.restype = ctypes.c_void_p
ssl.EC_KEY_new_by_curve_name.errcheck = check_result

class KEY:
    """Thin ctypes wrapper around an OpenSSL EC_KEY on the secp256k1 curve."""

    def __init__(self):
        NID_secp256k1 = 714  # OpenSSL numeric ID (NID) for the secp256k1 curve
        self.k = ssl.EC_KEY_new_by_curve_name(NID_secp256k1)
        self.compressed = False
        # OpenSSL point_conversion_form_t constants
        self.POINT_CONVERSION_COMPRESSED = 2
        self.POINT_CONVERSION_UNCOMPRESSED = 4

    def __del__(self):
        # Free the native EC_KEY; guard against the ssl module binding
        # having been torn down during interpreter shutdown.
        if ssl:
            ssl.EC_KEY_free(self.k)
        self.k = None

    def generate(self, secret=None):
        # Set the key pair from a 32-byte secret, or generate a random key
        # when no secret is given.
        if secret:
            self.prikey = secret
            priv_key = ssl.BN_bin2bn(secret, 32, ssl.BN_new())
            group = ssl.EC_KEY_get0_group(self.k)
            pub_key = ssl.EC_POINT_new(group)
            ctx = ssl.BN_CTX_new()
            # pub_key = priv_key * G (scalar multiplication of the base point)
            ssl.EC_POINT_mul(group, pub_key, priv_key, None, None, ctx)
            ssl.EC_KEY_set_private_key(self.k, priv_key)
            ssl.EC_KEY_set_public_key(self.k, pub_key)
            ssl.EC_POINT_free(pub_key)
            ssl.BN_CTX_free(ctx)
            return self.k
        else:
            return ssl.EC_KEY_generate_key(self.k)

    def get_pubkey(self):
        # i2o_ECPublicKey with a NULL buffer returns the needed size first.
        size = ssl.i2o_ECPublicKey(self.k, 0)
        mb = ctypes.create_string_buffer(size)
        ssl.i2o_ECPublicKey(self.k, ctypes.byref(ctypes.pointer(mb)))
        return mb.raw

    def get_secret(self):
        # Serialize the private-key BIGNUM, left-padded with NULs to 32 bytes.
        bn = ssl.EC_KEY_get0_private_key(self.k);
        bytes = (ssl.BN_num_bits(bn) + 7) / 8
        mb = ctypes.create_string_buffer(bytes)
        n = ssl.BN_bn2bin(bn, mb);
        return mb.raw.rjust(32, chr(0))

    def set_compressed(self, compressed):
        # Select compressed vs uncompressed SEC encoding for get_pubkey().
        self.compressed = compressed
        if compressed:
            form = self.POINT_CONVERSION_COMPRESSED
        else:
            form = self.POINT_CONVERSION_UNCOMPRESSED
        ssl.EC_KEY_set_conv_form(self.k, form)

def dhash(s):
    # Double SHA-256, as used throughout the Bitcoin protocol.
    return hashlib.sha256(hashlib.sha256(s).digest()).digest()

def rhash(s):
    # RIPEMD-160(SHA-256(s)): the Bitcoin "hash160" of a public key.
    h1 = hashlib.new('ripemd160')
    h1.update(hashlib.sha256(s).digest())
    return h1.digest()

# Base58 alphabet (omits 0, O, I, l to avoid visual ambiguity).
b58_digits = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'

def base58_encode(n):
    # Encode a non-negative integer as a base58 string.
    l = []
    while n > 0:
        n, r = divmod(n, 58)
        l.insert(0,(b58_digits[r]))
    return ''.join(l)

def base58_decode(s):
    # Decode a base58 string back into an integer.
    n = 0
    for ch in s:
        n *= 58
        digit = b58_digits.index(ch)
        n += digit
    return n

def base58_encode_padded(s):
    # Encode a byte string, preserving leading NUL bytes as leading '1's.
    res = base58_encode(int('0x' + s.encode('hex'), 16))
    pad = 0
    for c in s:
        if c == chr(0):
            pad += 1
        else:
            break
    return b58_digits[0] * pad + res

def base58_decode_padded(s):
    # Decode a base58 string, restoring leading '1's as NUL bytes.
    pad = 0
    for c in s:
        if c == b58_digits[0]:
            pad += 1
        else:
            break
    h = '%x' % base58_decode(s)
    if len(h) % 2:
        h = '0' + h
    res = h.decode('hex')
    return chr(0) * pad + res

def base58_check_encode(s, version=0):
    # Base58Check: version byte + payload + first 4 bytes of double-SHA256.
    vs = chr(version) + s
    check = dhash(vs)[:4]
    return base58_encode_padded(vs + check)

def base58_check_decode(s, version=0):
    # Inverse of base58_check_encode; validates checksum and version byte.
    k = base58_decode_padded(s)
    v0, data, check0 = k[0], k[1:-4], k[-4:]
    check1 = dhash(v0 + data)[:4]
    if check0 != check1:
        raise BaseException('checksum error')
    if version != ord(v0):
        raise BaseException('version mismatch')
    return data

def gen_eckey(passphrase=None, secret=None, pkey=None, compressed=False, rounds=1, version=0):
    # Build a KEY from one of: a passphrase (SHA-256 hashed `rounds` times),
    # a raw 32-byte secret, or a Base58Check-encoded private key (`pkey`).
    k = KEY()
    if passphrase:
        secret = passphrase.encode('utf8')
        for i in xrange(rounds):
            secret = hashlib.sha256(secret).digest()
    if pkey:
        secret = base58_check_decode(pkey, 128+version)
        compressed = len(secret) == 33  # a 33rd byte flags a compressed pubkey
        secret = secret[0:32]
    k.generate(secret)
    k.set_compressed(compressed)
    return k

def get_addr(k,version=0):
    # Derive the Base58Check address and WIF private key for KEY `k`, then
    # look the address up in the local MySQL table; on a hit, write the
    # key:address pair to top100winner.txt and exit the process.
    pubkey = k.get_pubkey()
    secret = k.get_secret()
    hash160 = rhash(pubkey)
    addr = base58_check_encode(hash160,version)
    payload = secret
    if k.compressed:
        payload = secret + chr(1)  # WIF compression flag byte
    pkey = base58_check_encode(payload, 128+version)
    print ("address:---" + addr)
    # NOTE(review): a fresh DB connection per address is very slow inside the
    # 10,000-key scan loop; consider connecting once in the caller.
    db = MySQLdb.connect("localhost","root","haha","bitcoindb" )
    cursor = db.cursor()
    # NOTE(review): string-built SQL; addr only contains base58 characters
    # here, but cursor.execute(sql, (addr,)) would be the safe form.
    sql = "SELECT * FROM bitcointable \
WHERE address = '%s'" % (addr)
    try:
        # Execute the SQL command
        cursor.execute(sql)
        # Fetch all the rows in a list of lists.
        results = cursor.fetchall()
        for row in results:
            # Match found: record the private key and address, then stop.
            print "we found one!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            print addr
            print pkey
            winnerfile = open('top100winner.txt', 'w')
            winnerfile.write(pkey + ':' + addr)
            winnerfile.close()
            exit()
    except:
        # NOTE(review): bare except hides any DB error; it just prints the
        # address and continues, and exit() above also lands here first
        # since SystemExit is caught by a bare except.
        print addr
    db.close()
    return addr, pkey

def reencode(pkey,version=0):
    # Re-encode a WIF private key with the compressed flag byte appended.
    # NOTE(review): gen_eckey's first positional parameter is `passphrase`,
    # so gen_eckey(pkey) treats the WIF string as a passphrase; this likely
    # intends gen_eckey(pkey=pkey) — confirm before relying on it.
    payload = base58_check_decode(pkey,128+version)
    secret = payload[:-1]
    payload = secret + chr(1)
    pkey = base58_check_encode(payload, 128+version)
    print get_addr(gen_eckey(pkey))

def test(otherversion):
    # Scan 10,000 sequential private keys starting at the hex value saved in
    # lastsearch.txt, checking both uncompressed and compressed addresses,
    # and persisting progress after every key so a run can resume.
    # random compressed
    #print get_addr(gen_eckey(compressed=True,version=otherversion),version=otherversion)
    # uncomment these to create addresses via a different method
    # random uncompressed
    #print get_addr(gen_eckey())
    # by secret
    inputfile = open('lastsearch.txt', 'r')
    startdata = inputfile.read()
    inputfile.close()
    print "starting point"
    counterfile = open('top100counter.txt', 'r')
    counter = counterfile.read()
    counterfile.close()
    inputlove=startdata.strip()
    inputlove = inputlove.zfill(64)  # pad to a full 64-hex-digit (256-bit) key
    inputkeyin = int(inputlove,16)
    startingpoint = int(inputlove,16)
    outcounter = int(counter)
    while inputkeyin < startingpoint + 10000:
        print inputkeyin
        # Drop the '0x' prefix and trailing 'L' (Python 2 long literal suffix).
        inputkeyhex = hex(inputkeyin)[2:-1]
        # print inputkeyhex
        get_addr(gen_eckey(secret=inputkeyhex.decode('hex')))
        get_addr(gen_eckey(compressed=True,secret=inputkeyhex.decode('hex')))
        inputkeyin = int(inputkeyhex,16)
        inputkeyin = inputkeyin + 1
        outcounter = outcounter + 1
        # Persist resume state after every key.
        outputfile = open('lastsearch.txt', 'w')
        outputfile.write(inputkeyhex)
        outputfile.close()
        if outcounter > 0:
            outputcounter = open('top100counter.txt', 'w')
            stroutcounter=str(outcounter)
            outputcounter.write(stroutcounter)
            outputcounter.close()

if __name__ == '__main__':
    import optparse
    parser = optparse.OptionParser(usage="%prog [options]")
    parser.add_option("--otherversion", dest="otherversion", default=0, help="Generate address with different version number")
    (options, args) = parser.parse_args()
    answeryes = "y"
    answercapitalyes = "Y"
    test(int(options.otherversion))
__________________
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#2 |
It's 42
Industry Role:
Join Date: Jun 2010
Location: Global
Posts: 18,083
|
Use phantomjs with a forged user agent, and paginate with the script
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#3 |
So fuckin' bored
Industry Role:
Join Date: Jun 2003
Posts: 32,381
|
See sig.
__________________
Obey the Cowgod |
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#4 |
Confirmed User
Industry Role:
Join Date: Jan 2012
Location: NC
Posts: 7,683
|
php advanced html dom
__________________
SSD Cloud Server, VPS Server, Simple Cloud Hosting | DigitalOcean
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#5 |
Confirmed User
Industry Role:
Join Date: Aug 2005
Location: YUROP
Posts: 8,595
|
Wow somebody is pretty rich
124,178 BTC ($251,612,773 USD)
__________________
![]() Anal Webcams | Kinky Trans Cams Live | Hotwife XXX Tube | Get your Proxies here |
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#6 |
It's 42
Industry Role:
Join Date: Jun 2010
Location: Global
Posts: 18,083
|
Actually it was this fuckin' easy for my IP
curl "https://bitinfocharts.com/top-100-richest-bitcoin-addresses.html">bitcoin.html this returned the first 100 curl "https://bitinfocharts.com/top-100-richest-bitcoin-addresses-2.html">bitcoin-2.html this returned the next 200 So you have a banned IP or user-agent |
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#7 | |
Confirmed User
Industry Role:
Join Date: Sep 2015
Posts: 1,045
|
Quote:
As thanks for your help. I have found a picture from your favorite tv show. ![]()
__________________
|
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#8 |
Confirmed User
Industry Role:
Join Date: Jan 2012
Location: NC
Posts: 7,683
|
there is always someone richer.
__________________
SSD Cloud Server, VPS Server, Simple Cloud Hosting | DigitalOcean
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#9 |
small trip to underworld
Industry Role:
Join Date: Mar 2012
Location: first gen intel 80386/nintendo-gb/arcade/ps1/internet person
Posts: 4,927
|
![]() ![]()
__________________
automatic exchange - paxum , bitcoin,pm, payza . daizzzy signbucks caution will black-hat black-hat your traffic ignored forever :zuzana designs
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#10 |
hey
Industry Role:
Join Date: Mar 2012
Location: with you
Posts: 2,209
|
I wish I were a bitcoin
![]()
__________________
Chaturbate Affiliate |
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#11 | |
Account Shutdown
Industry Role:
Join Date: Oct 2008
Location: Gone
Posts: 3,611
|
Quote:
What I'd recommend is that you use selenium with phantomjs. The reason is that with Python, as you have there, you can do all your writing, parsing, and database work within the script and do whatever you need to do without scripting phantomjs in javascript. from selenium import webdriver driver = webdriver.PhantomJS("fill this in with a /path/to/phantomjs if not set") driver.set_window_size(1120, 550) driver.get("https://duckduckgo.com/") driver.find_element_by_id('search_form_input_homepage').send_keys("realpython") driver.find_element_by_id("search_button_homepage").click() print driver.current_url driver.quit() The reason why I'd use something like phantomjs or selenium to control firefox is that the browser just takes care of it. If you use other libraries with python you'll possibly run into small errors with https or other things. You can always test using selenium with firefox so you can watch your browser do the work. |
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#12 |
StraightBro
Industry Role:
Join Date: Aug 2003
Location: Monarch Beach, CA USA
Posts: 56,232
|
I've seen my posts here scraped onto other forums word for word video for video it's been going on for years. There's clones of me making someone money 👁️👃👁️
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#13 |
Account Shutdown
Industry Role:
Join Date: Oct 2008
Location: Gone
Posts: 3,611
|
Python with selenium web driver, use web driver on phantomjs, parse with beautiful soup.
PHP is a great language (I don't care what anyone says) and works the easiest with most web servers but Python in my experience is one of the best general languages that is fast to complete a task and force people to convention. I love Python, listen to me! |
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#14 |
small trip to underworld
Industry Role:
Join Date: Mar 2012
Location: first gen intel 80386/nintendo-gb/arcade/ps1/internet person
Posts: 4,927
|
![]() ![]() ![]() ![]() ![]() ![]()
__________________
automatic exchange - paxum , bitcoin,pm, payza . daizzzy signbucks caution will black-hat black-hat your traffic ignored forever :zuzana designs
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#15 |
Confirmed User
Industry Role:
Join Date: Sep 2015
Posts: 1,045
|
This is ugly, but works. It extracts bitcoin addresses from the page that has been downloaded. I could do better and add curl to the python script to make it faster.
Code:
import sys searchstring = "https://bitinfocharts.com/bitcoin/address/" searchfile = open("bitcoin.html", "r") for line in searchfile: if searchstring in line: # bitaddress = line.split('"') htmlsplit = line.split(searchstring) counter = 1 trimfile = open('trimfile.txt', 'w') while counter < 101: left_text = htmlsplit[counter].partition("\"")[0] print left_text trimfile.write(left_text.strip()+'\n') counter = counter + 1 # print bitaddress trimfile.close() searchfile.close
__________________
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#16 |
Confirmed User
Industry Role:
Join Date: Sep 2015
Posts: 1,045
|
put the above code in a loop for the files I download.
Just run curl 80 times for each page. Run my file splitter and upload each file to sql. ![]()
__________________
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#17 |
It's 42
Industry Role:
Join Date: Jun 2010
Location: Global
Posts: 18,083
|
step 1 curl the page and > save
step 2 oneliner parse and save the data Code:
sed 's/>/>\n/g' bitcoin2.html|egrep '/bitcoin/address/'|cut -d'/' -f6|cut -d'"' -f1 |less Code:
$ sed 's/>/>\n/g' bitcoin2.html|egrep '/bitcoin/address/'|cut -d'/' -f6|cut -d'"' -f1 |more 1JCe8z4jJVNXSjohjM4i9Hh813dLCNx2Sy 3Nxwenay9Z8Lc9JBiywExpnEFiLp6Afp8v 3D2oetdNuZUqQHPJmcMDDHYoqkyNVsFk9r 1FeexV6bAHb8ybZjqQMjJrcCrHGW9sb6uF 1HQ3Go3ggs8pFnXuHVHRytPCq5fGG8Hbhx 16ZbpCEyVVdqu8VycWR8thUL2Rd9JnjzHt 1KiVwxEuGBYavyKrxkLncJt2pQ5YUUQX7f 1PnMfRF2enSZnR6JSexxBHuQnxG8Vo5FVK 1AhTjUMztCihiTyA4K6E3QEpobjWLwKhkR 1DiHDQMPFu4p84rkLn6Majj2LCZZZRQUaa 1EBHA1ckUWzNKN7BMfDwGTx6GKEbADUozX 14e7XAZbepQp9MXXzjNG3fNLoAUpaBAXHW 18f1yugoAJuXcHAbsuRVLQC9TezJ6iVRLp 1LdRcdxfbSnmCYYNdeYpUnztiYzVfBEQeC 1EfBMK9q6rGFZazeF7jyNdTgqGYgcDgRE5 16cou7Ht6WjTzuFyDBnht9hmvXytg6XdVT 1MuYkciQTfRsU94ReAe5MiAfUpCrbLBcFR 15CVfJUC1LKn1GKZx6RM5UMbFfnTd8vTT4 18rnfoQgGo1HqvVQaAN4QnxjYE7Sez9eca 3J5KeQSVBUEs3v2vEEkZDBtPLWqLTuZPuD 19Mz2o9RDABT74SA9njZqMtJXKEzj2qUoH 323ENWgPNZdzsm2d6CzEaPTFrvavn1giv5 17hf5H8D6Yc4B7zHEg3orAtKn7Jhme7Adx >> wallets.csv then; mysql> LOAD DATA LOCAL INFILE |
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#18 | |
Confirmed User
Industry Role:
Join Date: Sep 2015
Posts: 1,045
|
Quote:
Process 1. Use curl to download the first 80 Naming them 1.html, 2.html, 3.html and so on. 2. Run my program that will parse all 80 files. Code:
import sys searchstring = "https://bitinfocharts.com/bitcoin/address/" filecount = 1 while filecount < 81: searchfile = open(str(filecount)+".html", "r") for line in searchfile: if searchstring in line: # bitaddress = line.split('"') htmlsplit = line.split(searchstring) counter = 1 trimfile = open(str(filecount)+'.txt', 'w') while counter < 101: left_text = htmlsplit[counter].partition("\"")[0] print left_text trimfile.write(left_text.strip()+'\n') counter = counter + 1 trimfile.close() filecount = filecount + 1 searchfile.close up arrow and change to 2.txt and on and on. Thank you for your help my friend. You are the Kirk to my Khan. ![]()
__________________
|
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#19 |
Confirmed User
Industry Role:
Join Date: Sep 2015
Posts: 1,045
|
1 mistake in above code. Where I add 1 to filecounter. Line should be moved over to the left.
Kind of odd today. I notice the site is kind of messed up. I want to download the next 40 pages to add to my database. Maybe the site owner noticed me scraping every page up to 80? I need to find a new source for my database. And maybe I will get to use your sed command next.
__________________
|
![]() |
![]() ![]() ![]() ![]() ![]() |