![]() |
![]() |
![]() |
||||
Welcome to the GoFuckYourself.com - Adult Webmaster Forum forums. You are currently viewing our boards as a guest which gives you limited access to view most discussions and access our other features. By joining our free community you will have access to post topics, communicate privately with other members (PM), respond to polls, upload content and access many other special features. Registration is fast, simple and absolutely free so please, join our community today! If you have any problems with the registration process or your account login, please contact us. |
![]() ![]() |
|
Discuss what's fucking going on, and which programs are best and worst. One-time "program" announcements from "established" webmasters are allowed. |
|
Thread Tools |
![]() |
#1 |
Confirmed User
Industry Role:
Join Date: Sep 2015
Posts: 1,045
|
Best way to scrape this site?
Best way to scrape this site?
https://bitinfocharts.com/top-100-ri...addresses.html I tried to scrape with beautiful soup. I got a connection refused. I may copy site local, then scrape using beautiful soup. I want to keep an updated list of 10,000 bitcoin addresses. I manually entered the top 600 bitcoin addresses into a text file and uploaded to mysql. I have my bitcoin collider working now. I decided that instead of searching every bitcoin address in the blockchain I want to search a smaller database locally. This coding is so much fun. Code:
#!/usr/bin/env python
# Joric/bitcoin-dev, june 2012, public domain
#
# Brute-force "bitcoin collider": derives Bitcoin addresses from sequential
# 256-bit private keys using OpenSSL (via ctypes) and checks each derived
# address against a local MySQL table of known addresses. Progress is
# persisted to lastsearch.txt / top100counter.txt so a run can resume.
import hashlib
import time
import ctypes
import ctypes.util
import sys
import MySQLdb

# Load the OpenSSL crypto library; 'libeay32' is the legacy Windows DLL name.
ssl = ctypes.cdll.LoadLibrary (ctypes.util.find_library ('ssl') or 'libeay32')

def check_result (val, func, args):
    # ctypes errcheck hook: a NULL (0) pointer returned by OpenSSL is an error.
    if val == 0: raise ValueError
    else: return ctypes.c_void_p (val)

ssl.EC_KEY_new_by_curve_name.restype = ctypes.c_void_p
ssl.EC_KEY_new_by_curve_name.errcheck = check_result

class KEY:
    """Thin ctypes wrapper around an OpenSSL EC_KEY on the secp256k1 curve."""

    def __init__(self):
        NID_secp256k1 = 714  # OpenSSL numeric ID (NID) for the secp256k1 curve
        self.k = ssl.EC_KEY_new_by_curve_name(NID_secp256k1)
        self.compressed = False
        # OpenSSL point_conversion_form_t constants
        self.POINT_CONVERSION_COMPRESSED = 2
        self.POINT_CONVERSION_UNCOMPRESSED = 4

    def __del__(self):
        # Free the native EC_KEY; guard against the ssl module binding
        # having been torn down during interpreter shutdown.
        if ssl:
            ssl.EC_KEY_free(self.k)
        self.k = None

    def generate(self, secret=None):
        # Set the key pair from a 32-byte secret, or generate a random key
        # when no secret is given.
        if secret:
            self.prikey = secret
            priv_key = ssl.BN_bin2bn(secret, 32, ssl.BN_new())
            group = ssl.EC_KEY_get0_group(self.k)
            pub_key = ssl.EC_POINT_new(group)
            ctx = ssl.BN_CTX_new()
            # pub_key = priv_key * G (scalar multiplication of the base point)
            ssl.EC_POINT_mul(group, pub_key, priv_key, None, None, ctx)
            ssl.EC_KEY_set_private_key(self.k, priv_key)
            ssl.EC_KEY_set_public_key(self.k, pub_key)
            ssl.EC_POINT_free(pub_key)
            ssl.BN_CTX_free(ctx)
            return self.k
        else:
            return ssl.EC_KEY_generate_key(self.k)

    def get_pubkey(self):
        # i2o_ECPublicKey with a NULL buffer returns the needed size first.
        size = ssl.i2o_ECPublicKey(self.k, 0)
        mb = ctypes.create_string_buffer(size)
        ssl.i2o_ECPublicKey(self.k, ctypes.byref(ctypes.pointer(mb)))
        return mb.raw

    def get_secret(self):
        # Serialize the private-key BIGNUM, left-padded with NULs to 32 bytes.
        bn = ssl.EC_KEY_get0_private_key(self.k);
        bytes = (ssl.BN_num_bits(bn) + 7) / 8
        mb = ctypes.create_string_buffer(bytes)
        n = ssl.BN_bn2bin(bn, mb);
        return mb.raw.rjust(32, chr(0))

    def set_compressed(self, compressed):
        # Select compressed vs uncompressed SEC encoding for get_pubkey().
        self.compressed = compressed
        if compressed:
            form = self.POINT_CONVERSION_COMPRESSED
        else:
            form = self.POINT_CONVERSION_UNCOMPRESSED
        ssl.EC_KEY_set_conv_form(self.k, form)

def dhash(s):
    # Double SHA-256, as used throughout the Bitcoin protocol.
    return hashlib.sha256(hashlib.sha256(s).digest()).digest()

def rhash(s):
    # RIPEMD-160(SHA-256(s)): the Bitcoin "hash160" of a public key.
    h1 = hashlib.new('ripemd160')
    h1.update(hashlib.sha256(s).digest())
    return h1.digest()

# Base58 alphabet (omits 0, O, I, l to avoid visual ambiguity).
b58_digits = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'

def base58_encode(n):
    # Encode a non-negative integer as a base58 string.
    l = []
    while n > 0:
        n, r = divmod(n, 58)
        l.insert(0,(b58_digits[r]))
    return ''.join(l)

def base58_decode(s):
    # Decode a base58 string back into an integer.
    n = 0
    for ch in s:
        n *= 58
        digit = b58_digits.index(ch)
        n += digit
    return n

def base58_encode_padded(s):
    # Encode a byte string, preserving leading NUL bytes as leading '1's.
    res = base58_encode(int('0x' + s.encode('hex'), 16))
    pad = 0
    for c in s:
        if c == chr(0):
            pad += 1
        else:
            break
    return b58_digits[0] * pad + res

def base58_decode_padded(s):
    # Decode a base58 string, restoring leading '1's as NUL bytes.
    pad = 0
    for c in s:
        if c == b58_digits[0]:
            pad += 1
        else:
            break
    h = '%x' % base58_decode(s)
    if len(h) % 2:
        h = '0' + h
    res = h.decode('hex')
    return chr(0) * pad + res

def base58_check_encode(s, version=0):
    # Base58Check: version byte + payload + first 4 bytes of double-SHA256.
    vs = chr(version) + s
    check = dhash(vs)[:4]
    return base58_encode_padded(vs + check)

def base58_check_decode(s, version=0):
    # Inverse of base58_check_encode; validates checksum and version byte.
    k = base58_decode_padded(s)
    v0, data, check0 = k[0], k[1:-4], k[-4:]
    check1 = dhash(v0 + data)[:4]
    if check0 != check1:
        raise BaseException('checksum error')
    if version != ord(v0):
        raise BaseException('version mismatch')
    return data

def gen_eckey(passphrase=None, secret=None, pkey=None, compressed=False, rounds=1, version=0):
    # Build a KEY from one of: a passphrase (SHA-256 hashed `rounds` times),
    # a raw 32-byte secret, or a Base58Check-encoded private key (`pkey`).
    k = KEY()
    if passphrase:
        secret = passphrase.encode('utf8')
        for i in xrange(rounds):
            secret = hashlib.sha256(secret).digest()
    if pkey:
        secret = base58_check_decode(pkey, 128+version)
        compressed = len(secret) == 33  # a 33rd byte flags a compressed pubkey
        secret = secret[0:32]
    k.generate(secret)
    k.set_compressed(compressed)
    return k

def get_addr(k,version=0):
    # Derive the Base58Check address and WIF private key for KEY `k`, then
    # look the address up in the local MySQL table; on a hit, write the
    # key:address pair to top100winner.txt and exit the process.
    pubkey = k.get_pubkey()
    secret = k.get_secret()
    hash160 = rhash(pubkey)
    addr = base58_check_encode(hash160,version)
    payload = secret
    if k.compressed:
        payload = secret + chr(1)  # WIF compression flag byte
    pkey = base58_check_encode(payload, 128+version)
    print ("address:---" + addr)
    # NOTE(review): a fresh DB connection per address is very slow inside the
    # 10,000-key scan loop; consider connecting once in the caller.
    db = MySQLdb.connect("localhost","root","haha","bitcoindb" )
    cursor = db.cursor()
    # NOTE(review): string-built SQL; addr only contains base58 characters
    # here, but cursor.execute(sql, (addr,)) would be the safe form.
    sql = "SELECT * FROM bitcointable \
WHERE address = '%s'" % (addr)
    try:
        # Execute the SQL command
        cursor.execute(sql)
        # Fetch all the rows in a list of lists.
        results = cursor.fetchall()
        for row in results:
            # Match found: record the private key and address, then stop.
            print "we found one!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
            print addr
            print pkey
            winnerfile = open('top100winner.txt', 'w')
            winnerfile.write(pkey + ':' + addr)
            winnerfile.close()
            exit()
    except:
        # NOTE(review): bare except hides any DB error; it just prints the
        # address and continues, and exit() above also lands here first
        # since SystemExit is caught by a bare except.
        print addr
    db.close()
    return addr, pkey

def reencode(pkey,version=0):
    # Re-encode a WIF private key with the compressed flag byte appended.
    # NOTE(review): gen_eckey's first positional parameter is `passphrase`,
    # so gen_eckey(pkey) treats the WIF string as a passphrase; this likely
    # intends gen_eckey(pkey=pkey) — confirm before relying on it.
    payload = base58_check_decode(pkey,128+version)
    secret = payload[:-1]
    payload = secret + chr(1)
    pkey = base58_check_encode(payload, 128+version)
    print get_addr(gen_eckey(pkey))

def test(otherversion):
    # Scan 10,000 sequential private keys starting at the hex value saved in
    # lastsearch.txt, checking both uncompressed and compressed addresses,
    # and persisting progress after every key so a run can resume.
    # random compressed
    #print get_addr(gen_eckey(compressed=True,version=otherversion),version=otherversion)
    # uncomment these to create addresses via a different method
    # random uncompressed
    #print get_addr(gen_eckey())
    # by secret
    inputfile = open('lastsearch.txt', 'r')
    startdata = inputfile.read()
    inputfile.close()
    print "starting point"
    counterfile = open('top100counter.txt', 'r')
    counter = counterfile.read()
    counterfile.close()
    inputlove=startdata.strip()
    inputlove = inputlove.zfill(64)  # pad to a full 64-hex-digit (256-bit) key
    inputkeyin = int(inputlove,16)
    startingpoint = int(inputlove,16)
    outcounter = int(counter)
    while inputkeyin < startingpoint + 10000:
        print inputkeyin
        # Drop the '0x' prefix and trailing 'L' (Python 2 long literal suffix).
        inputkeyhex = hex(inputkeyin)[2:-1]
        # print inputkeyhex
        get_addr(gen_eckey(secret=inputkeyhex.decode('hex')))
        get_addr(gen_eckey(compressed=True,secret=inputkeyhex.decode('hex')))
        inputkeyin = int(inputkeyhex,16)
        inputkeyin = inputkeyin + 1
        outcounter = outcounter + 1
        # Persist resume state after every key.
        outputfile = open('lastsearch.txt', 'w')
        outputfile.write(inputkeyhex)
        outputfile.close()
        if outcounter > 0:
            outputcounter = open('top100counter.txt', 'w')
            stroutcounter=str(outcounter)
            outputcounter.write(stroutcounter)
            outputcounter.close()

if __name__ == '__main__':
    import optparse
    parser = optparse.OptionParser(usage="%prog [options]")
    parser.add_option("--otherversion", dest="otherversion", default=0, help="Generate address with different version number")
    (options, args) = parser.parse_args()
    answeryes = "y"
    answercapitalyes = "Y"
    test(int(options.otherversion))
__________________
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#2 |
It's 42
Industry Role:
Join Date: Jun 2010
Location: Global
Posts: 18,083
|
Use phantomjs with a forged user agent, and paginate with the script
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#3 |
So fuckin' bored
Industry Role:
Join Date: Jun 2003
Posts: 32,381
|
See sig.
__________________
Obey the Cowgod |
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#4 |
Confirmed User
Industry Role:
Join Date: Jan 2012
Location: NC
Posts: 7,683
|
php advanced html dom
__________________
SSD Cloud Server, VPS Server, Simple Cloud Hosting | DigitalOcean
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#5 |
Confirmed User
Industry Role:
Join Date: Aug 2005
Location: YUROP
Posts: 8,595
|
Wow somebody is pretty rich
124,178 BTC ($251,612,773 USD)
__________________
![]() Anal Webcams | Kinky Trans Cams Live | Hotwife XXX Tube | Get your Proxies here |
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#6 |
It's 42
Industry Role:
Join Date: Jun 2010
Location: Global
Posts: 18,083
|
Actually it was this fuckin' easy for my IP
curl "https://bitinfocharts.com/top-100-richest-bitcoin-addresses.html">bitcoin.html this returned the first 100 curl "https://bitinfocharts.com/top-100-richest-bitcoin-addresses-2.html">bitcoin-2.html this returned the next 200 So you have a banned IP or user-agent |
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#7 | |
Confirmed User
Industry Role:
Join Date: Sep 2015
Posts: 1,045
|
Quote:
As thanks for your help. I have found a picture from your favorite tv show. ![]()
__________________
|
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#8 |
Confirmed User
Industry Role:
Join Date: Jan 2012
Location: NC
Posts: 7,683
|
there is always someone richer.
__________________
SSD Cloud Server, VPS Server, Simple Cloud Hosting | DigitalOcean
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#9 |
small trip to underworld
Industry Role:
Join Date: Mar 2012
Location: first gen intel 80386/nintendo-gb/arcade/ps1/internet person
Posts: 4,927
|
![]() ![]()
__________________
automatic exchange - paxum , bitcoin,pm, payza . daizzzy signbucks caution will black-hat black-hat your traffic ignored forever :zuzana designs
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#10 |
hey
Industry Role:
Join Date: Mar 2012
Location: with you
Posts: 2,209
|
I wish I were a bitcoin
![]()
__________________
Chaturbate Affiliate |
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#11 | |
Account Shutdown
Industry Role:
Join Date: Oct 2008
Location: Gone
Posts: 3,611
|
Quote:
What I'd recommend is that you use selenium with phantomjs. The reason is that with Python, as you have there, you can do all your writing, parsing, and database work within the script and do whatever you need to do without scripting phantomjs in javascript. from selenium import webdriver driver = webdriver.PhantomJS("fill this in with a /path/to/phantomjs if not set") driver.set_window_size(1120, 550) driver.get("https://duckduckgo.com/") driver.find_element_by_id('search_form_input_homepage').send_keys("realpython") driver.find_element_by_id("search_button_homepage").click() print driver.current_url driver.quit() The reason why I'd use something like phantomjs or selenium to control firefox is that the browser just takes care of it. If you use other libraries with python you'll possibly run into small errors with https or other things. You can always test using selenium with firefox so you can watch your browser do the work. |
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#12 |
StraightBro
Industry Role:
Join Date: Aug 2003
Location: Monarch Beach, CA USA
Posts: 56,232
|
I've seen my posts here scraped onto other forums word for word video for video it's been going on for years. There's clones of me making someone money 👁️👃👁️
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#13 |
Account Shutdown
Industry Role:
Join Date: Oct 2008
Location: Gone
Posts: 3,611
|
Python with selenium web driver, use web driver on phantomjs, parse with beautiful soup.
PHP is a great language (I don't care what anyone says) and works the easiest with most web servers but Python in my experience is one of the best general languages that is fast to complete a task and force people to convention. I love Python, listen to me! |
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#14 |
small trip to underworld
Industry Role:
Join Date: Mar 2012
Location: first gen intel 80386/nintendo-gb/arcade/ps1/internet person
Posts: 4,927
|
![]() ![]() ![]() ![]() ![]() ![]()
__________________
automatic exchange - paxum , bitcoin,pm, payza . daizzzy signbucks caution will black-hat black-hat your traffic ignored forever :zuzana designs
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#15 |
Confirmed User
Industry Role:
Join Date: Sep 2015
Posts: 1,045
|
This is ugly, but works. It extracts bitcoin addresses from the page that has been downloaded. I could do better and add curl to the python script to make it faster.
Code:
import sys searchstring = "https://bitinfocharts.com/bitcoin/address/" searchfile = open("bitcoin.html", "r") for line in searchfile: if searchstring in line: # bitaddress = line.split('"') htmlsplit = line.split(searchstring) counter = 1 trimfile = open('trimfile.txt', 'w') while counter < 101: left_text = htmlsplit[counter].partition("\"")[0] print left_text trimfile.write(left_text.strip()+'\n') counter = counter + 1 # print bitaddress trimfile.close() searchfile.close
__________________
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#16 |
Confirmed User
Industry Role:
Join Date: Sep 2015
Posts: 1,045
|
put the above code in a loop for the files I download.
Just run curl 80 times for each page. Run my file splitter and upload each file to sql. ![]()
__________________
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#17 |
It's 42
Industry Role:
Join Date: Jun 2010
Location: Global
Posts: 18,083
|
step 1 curl the page and > save
step 2 oneliner parse and save the data Code:
sed 's/>/>\n/g' bitcoin2.html|egrep '/bitcoin/address/'|cut -d'/' -f6|cut -d'"' -f1 |less Code:
$ sed 's/>/>\n/g' bitcoin2.html|egrep '/bitcoin/address/'|cut -d'/' -f6|cut -d'"' -f1 |more 1JCe8z4jJVNXSjohjM4i9Hh813dLCNx2Sy 3Nxwenay9Z8Lc9JBiywExpnEFiLp6Afp8v 3D2oetdNuZUqQHPJmcMDDHYoqkyNVsFk9r 1FeexV6bAHb8ybZjqQMjJrcCrHGW9sb6uF 1HQ3Go3ggs8pFnXuHVHRytPCq5fGG8Hbhx 16ZbpCEyVVdqu8VycWR8thUL2Rd9JnjzHt 1KiVwxEuGBYavyKrxkLncJt2pQ5YUUQX7f 1PnMfRF2enSZnR6JSexxBHuQnxG8Vo5FVK 1AhTjUMztCihiTyA4K6E3QEpobjWLwKhkR 1DiHDQMPFu4p84rkLn6Majj2LCZZZRQUaa 1EBHA1ckUWzNKN7BMfDwGTx6GKEbADUozX 14e7XAZbepQp9MXXzjNG3fNLoAUpaBAXHW 18f1yugoAJuXcHAbsuRVLQC9TezJ6iVRLp 1LdRcdxfbSnmCYYNdeYpUnztiYzVfBEQeC 1EfBMK9q6rGFZazeF7jyNdTgqGYgcDgRE5 16cou7Ht6WjTzuFyDBnht9hmvXytg6XdVT 1MuYkciQTfRsU94ReAe5MiAfUpCrbLBcFR 15CVfJUC1LKn1GKZx6RM5UMbFfnTd8vTT4 18rnfoQgGo1HqvVQaAN4QnxjYE7Sez9eca 3J5KeQSVBUEs3v2vEEkZDBtPLWqLTuZPuD 19Mz2o9RDABT74SA9njZqMtJXKEzj2qUoH 323ENWgPNZdzsm2d6CzEaPTFrvavn1giv5 17hf5H8D6Yc4B7zHEg3orAtKn7Jhme7Adx >> wallets.csv then; mysql> LOAD DATA LOCAL INFILE |
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#18 | |
Confirmed User
Industry Role:
Join Date: Sep 2015
Posts: 1,045
|
Quote:
Process 1. Use curl to download the first 80 Naming them 1.html, 2.html, 3.html and so on. 2. Run my program that will parse all 80 files. Code:
import sys searchstring = "https://bitinfocharts.com/bitcoin/address/" filecount = 1 while filecount < 81: searchfile = open(str(filecount)+".html", "r") for line in searchfile: if searchstring in line: # bitaddress = line.split('"') htmlsplit = line.split(searchstring) counter = 1 trimfile = open(str(filecount)+'.txt', 'w') while counter < 101: left_text = htmlsplit[counter].partition("\"")[0] print left_text trimfile.write(left_text.strip()+'\n') counter = counter + 1 trimfile.close() filecount = filecount + 1 searchfile.close up arrow and change to 2.txt and on and on. Thank you for your help my friend. You are the Kirk to my Khan. ![]()
__________________
|
|
![]() |
![]() ![]() ![]() ![]() ![]() |
![]() |
#19 |
Confirmed User
Industry Role:
Join Date: Sep 2015
Posts: 1,045
|
1 mistake in above code. Where I add 1 to filecounter. Line should be moved over to the left.
Kind of odd today. I notice the site is kind of messed up. I want to download the next 40 pages to add to my database. Maybe the site owner noticed me scraping every page up to 80? I need to find a new source for my database. And maybe I will get to use your sed command next.
__________________
|
![]() |
![]() ![]() ![]() ![]() ![]() |