GoFuckYourself.com - Adult Webmaster Forum

GoFuckYourself.com - Adult Webmaster Forum (https://gfy.com/index.php)
-   Fucking Around & Business Discussion (https://gfy.com/forumdisplay.php?f=26)
-   -   Best way to scrape this site? (https://gfy.com/showthread.php?t=1264621)

deonbell 05-27-2017 03:49 AM

Best way to scrape this site?
 
Best way to scrape this site?

https://bitinfocharts.com/top-100-ri...addresses.html

I tried to scrape with beautiful soup. I got a connection refused. I may copy site local, then scrape using beautiful soup.

I want to keep an updated list of 10,000 bitcoin addresses. I manually entered the top 600 bitcoin addresses into a text file and uploaded to mysql.

I have my bitcoin collider working now. I decided that instead of searching every bitcoin address in the blockchain I want to search a smaller database locally.

This coding is so much fun.

Code:

#!/usr/bin/env python
# Joric/bitcoin-dev, june 2012, public domain

import hashlib
import time
import ctypes
import ctypes.util
import sys
import MySQLdb

# Load OpenSSL via ctypes: find_library('ssl') on POSIX systems, falling
# back to the legacy 'libeay32' DLL name used on Windows builds.
ssl = ctypes.cdll.LoadLibrary (ctypes.util.find_library ('ssl') or 'libeay32')

def check_result (val, func, args):
    """ctypes errcheck hook: a NULL (0) return from OpenSSL is an error.

    Non-zero results are wrapped in c_void_p so the pointer value is
    preserved intact on 64-bit platforms.
    """
    if val == 0:
        raise ValueError
    return ctypes.c_void_p(val)

# Treat EC_KEY_new_by_curve_name's return as a pointer-sized value and
# route it through check_result so a NULL allocation raises ValueError.
ssl.EC_KEY_new_by_curve_name.restype = ctypes.c_void_p
ssl.EC_KEY_new_by_curve_name.errcheck = check_result

class KEY:
    """Thin ctypes wrapper around an OpenSSL EC_KEY on the secp256k1 curve.

    Holds the raw EC_KEY pointer in self.k; every operation calls straight
    into libssl/libcrypto, so this object owns (and must free) the
    underlying C structure.
    """
    def __init__(self):
        # 714 is OpenSSL's NID for secp256k1 (the Bitcoin curve).
        NID_secp256k1 = 714
        self.k = ssl.EC_KEY_new_by_curve_name(NID_secp256k1)
        self.compressed = False
        # OpenSSL point_conversion_form_t constants.
        self.POINT_CONVERSION_COMPRESSED = 2
        self.POINT_CONVERSION_UNCOMPRESSED = 4

    def __del__(self):
        # The module-level `ssl` binding may already be torn down during
        # interpreter shutdown; only free the key while it is alive.
        if ssl:
            ssl.EC_KEY_free(self.k)
        self.k = None

    def generate(self, secret=None):
        """Set the key pair from a 32-byte big-endian secret (public point
        computed as secret*G), or generate a random key when secret is falsy."""
        if secret:
            self.prikey = secret
            priv_key = ssl.BN_bin2bn(secret, 32, ssl.BN_new())
            group = ssl.EC_KEY_get0_group(self.k)
            pub_key = ssl.EC_POINT_new(group)
            ctx = ssl.BN_CTX_new()
            # pub_key = priv_key * G (scalar multiplication by the generator)
            ssl.EC_POINT_mul(group, pub_key, priv_key, None, None, ctx)
            ssl.EC_KEY_set_private_key(self.k, priv_key)
            ssl.EC_KEY_set_public_key(self.k, pub_key)
            ssl.EC_POINT_free(pub_key)
            ssl.BN_CTX_free(ctx)
            return self.k
        else:
            return ssl.EC_KEY_generate_key(self.k)

    def get_pubkey(self):
        """Return the SEC1-encoded public key as a raw byte string."""
        # First call with a NULL buffer just reports the required size.
        size = ssl.i2o_ECPublicKey(self.k, 0)
        mb = ctypes.create_string_buffer(size)
        ssl.i2o_ECPublicKey(self.k, ctypes.byref(ctypes.pointer(mb)))
        return mb.raw

    def get_secret(self):
        """Return the private key as a 32-byte big-endian string."""
        bn = ssl.EC_KEY_get0_private_key(self.k);
        bytes = (ssl.BN_num_bits(bn) + 7) / 8
        mb = ctypes.create_string_buffer(bytes)
        n = ssl.BN_bn2bin(bn, mb);
        # Left-pad with NUL bytes so short secrets still come out 32 bytes.
        return mb.raw.rjust(32, chr(0))

    def set_compressed(self, compressed):
        """Select compressed (33-byte) or uncompressed (65-byte) pubkey form."""
        self.compressed = compressed
        if compressed:
            form = self.POINT_CONVERSION_COMPRESSED
        else:
            form = self.POINT_CONVERSION_UNCOMPRESSED
        ssl.EC_KEY_set_conv_form(self.k, form)

def dhash(s):
    """Double SHA-256, as used throughout the Bitcoin protocol."""
    inner = hashlib.sha256(s).digest()
    return hashlib.sha256(inner).digest()

def rhash(s):
    """RIPEMD-160 of SHA-256 (Bitcoin's HASH160 of a public key)."""
    ripe = hashlib.new('ripemd160')
    ripe.update(hashlib.sha256(s).digest())
    return ripe.digest()

b58_digits = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'

def base58_encode(n):
    """Encode a non-negative integer using Bitcoin's base-58 alphabet.

    Returns the empty string for n == 0 (padding is handled by callers).
    """
    out = []
    while n > 0:
        n, rem = divmod(n, 58)
        out.append(b58_digits[rem])
    out.reverse()
    return ''.join(out)

def base58_decode(s):
    """Decode a base-58 string back into an integer (Horner's method)."""
    total = 0
    for ch in s:
        total = total * 58 + b58_digits.index(ch)
    return total

def base58_encode_padded(s):
    """Base58-encode byte string s, keeping each leading zero byte as a
    leading '1' character (the Bitcoin address convention)."""
    encoded = base58_encode(int('0x' + s.encode('hex'), 16))
    pad = 0
    for c in s:
        if c != chr(0):
            break
        pad += 1
    return b58_digits[0] * pad + encoded

def base58_decode_padded(s):
    """Inverse of base58_encode_padded: decode s to bytes, restoring each
    leading '1' character as a leading zero byte."""
    pad = 0
    for c in s:
        if c != b58_digits[0]:
            break
        pad += 1
    hexed = '%x' % base58_decode(s)
    if len(hexed) % 2:
        # hex() drops leading zeros; re-pad to an even nibble count
        hexed = '0' + hexed
    return chr(0) * pad + hexed.decode('hex')

def base58_check_encode(s, version=0):
    """Prepend a version byte, append a 4-byte double-SHA256 checksum,
    and base58-encode the result (Base58Check)."""
    payload = chr(version) + s
    checksum = dhash(payload)[:4]
    return base58_encode_padded(payload + checksum)

def base58_check_decode(s, version=0):
    """Decode a Base58Check string and verify checksum and version byte.

    Returns the payload bytes (version byte and 4-byte checksum stripped).
    Raises ValueError on a checksum or version mismatch.
    """
    k = base58_decode_padded(s)
    v0, data, check0 = k[0], k[1:-4], k[-4:]
    check1 = dhash(v0 + data)[:4]
    if check0 != check1:
        # was `raise BaseException(...)`: never raise BaseException directly;
        # ValueError is still caught by every existing broad handler.
        raise ValueError('checksum error')
    if version != ord(v0):
        raise ValueError('version mismatch')
    return data

def gen_eckey(passphrase=None, secret=None, pkey=None, compressed=False, rounds=1, version=0):
    """Build a KEY from a passphrase, a raw 32-byte secret, or a WIF
    private key (pkey); generates a random key when none is supplied."""
    k = KEY()
    if passphrase:
        # Stretch the passphrase with `rounds` iterations of SHA-256.
        secret = passphrase.encode('utf8')
        for _ in xrange(rounds):
            secret = hashlib.sha256(secret).digest()
    if pkey:
        # WIF import: a 33-byte payload carries the compressed-pubkey marker.
        secret = base58_check_decode(pkey, 128 + version)
        compressed = len(secret) == 33
        secret = secret[0:32]
    k.generate(secret)
    k.set_compressed(compressed)
    return k

def get_addr(k,version=0):
    pubkey = k.get_pubkey()
    secret = k.get_secret()
    hash160 = rhash(pubkey)
    addr = base58_check_encode(hash160,version)
    payload = secret
    if k.compressed:
        payload = secret + chr(1)
    pkey = base58_check_encode(payload, 128+version)
    print ("address:---" + addr)
    db = MySQLdb.connect("localhost","root","haha","bitcoindb" )
    cursor = db.cursor()
    sql = "SELECT * FROM bitcointable \
      WHERE address = '%s'" % (addr)
    try:
    # Execute the SQL command
      cursor.execute(sql)
      # Fetch all the rows in a list of lists.
      results = cursor.fetchall()
      for row in results:
        print "we found one!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        print addr
        print pkey
        winnerfile = open('top100winner.txt', 'w')
        winnerfile.write(pkey + ':' + addr)
        winnerfile.close()
        exit()
    except:
          print addr
 
    db.close()
    return addr, pkey

def reencode(pkey,version=0):
    payload = base58_check_decode(pkey,128+version)
    secret = payload[:-1]
    payload = secret + chr(1)
    pkey = base58_check_encode(payload, 128+version)
    print get_addr(gen_eckey(pkey))

def test(otherversion):
    # random compressed
    #print get_addr(gen_eckey(compressed=True,version=otherversion),version=otherversion)

    # uncomment these to create addresses via a different method
    # random uncompressed
    #print get_addr(gen_eckey())
    # by secret
   
    inputfile = open('lastsearch.txt', 'r')
    startdata = inputfile.read()
    inputfile.close()
    print "starting point"

    counterfile = open('top100counter.txt', 'r')
    counter = counterfile.read()
    counterfile.close()
 
    inputlove=startdata.strip() 
    inputlove = inputlove.zfill(64)
    inputkeyin = int(inputlove,16)

    startingpoint = int(inputlove,16)
    outcounter = int(counter)
   
    while inputkeyin < startingpoint + 10000:
        print inputkeyin
        inputkeyhex = hex(inputkeyin)[2:-1]
  #    print inputkeyhex
        get_addr(gen_eckey(secret=inputkeyhex.decode('hex')))
        get_addr(gen_eckey(compressed=True,secret=inputkeyhex.decode('hex')))
        inputkeyin = int(inputkeyhex,16)
        inputkeyin = inputkeyin + 1
        outcounter = outcounter + 1

    outputfile = open('lastsearch.txt', 'w')
    outputfile.write(inputkeyhex)
    outputfile.close()
    if outcounter > 0:
      outputcounter = open('top100counter.txt', 'w')
      stroutcounter=str(outcounter)
      outputcounter.write(stroutcounter)
      outputcounter.close()
 
if __name__ == '__main__':
    import optparse
    parser = optparse.OptionParser(usage="%prog [options]")
    parser.add_option("--otherversion", dest="otherversion", default=0,
                    help="Generate address with different version number")
    (options, args) = parser.parse_args()

    answeryes = "y"
    answercapitalyes = "Y"

    # moved under the __main__ guard: previously this line sat at module
    # level, so merely importing the file started the scan and crashed
    # with a NameError on the then-undefined `options`
    test(int(options.otherversion))


Barry-xlovecam 05-27-2017 04:05 AM

Use phantomjs with a forged user agent paginate the script

just a punk 05-27-2017 04:56 AM

See sig.

freecartoonporn 05-27-2017 05:01 AM

php advanced html dom

Paul&John 05-27-2017 05:41 AM

Wow somebody is pretty rich
124,178 BTC ($251,612,773 USD)

Barry-xlovecam 05-27-2017 05:50 AM

Actually it was this fuckin' easy for my IP
curl "https://bitinfocharts.com/top-100-richest-bitcoin-addresses.html">bitcoin.html
this returned the first 100
curl "https://bitinfocharts.com/top-100-richest-bitcoin-addresses-2.html">bitcoin-2.html
this returned the next 200

So you have a banned IP or user-agent

deonbell 05-27-2017 10:47 AM

Quote:

Originally Posted by Barry-xlovecam (Post 21795562)
Actually it was this fuckin' easy for my IP
curl "https://bitinfocharts.com/top-100-richest-bitcoin-addresses.html">bitcoin.html
this returned the first 100
curl "https://bitinfocharts.com/top-100-richest-bitcoin-addresses-2.html">bitcoin-2.html
this returned the next 200

So you have a banned IP or user-agent

Yes, This worked for me too. I will create a program that creates a text file of just the bitcoin addresses to upload to my database.

As thanks for your help. I have found a picture from your favorite tv show.
http://i.imgur.com/1NpdOgZ.jpg

freecartoonporn 05-27-2017 06:44 PM

Quote:

Originally Posted by Paul&John (Post 21795550)
Wow somebody is pretty rich
124,178 BTC ($251,612,773 USD)

there is always someone richer.

CPA-Rush 05-27-2017 07:44 PM

https://i.ytimg.com/vi/eRFLJP-N18s/maxresdefault.jpg
http://s2.quickmeme.com/img/6d/6d7e6...597ad048b2.jpg

dillfly2000 05-27-2017 07:49 PM

I wish I were a bitcoin :(

johnnyloadproductions 05-27-2017 07:52 PM

Quote:

Originally Posted by Barry-xlovecam (Post 21795484)
Use phantomjs with a forged user agent paginate the script

What this person said.

What I'd recommend is that you use selenium together with phantomjs.

The reason is that with Python, as you have there, you can do all your writing, parsing, and database work within the script and do whatever you need to do without scripting phantomjs in javascript.

from selenium import webdriver
driver = webdriver.PhantomJS("file this in with a /path/to/phantomjs if not set")
driver.set_window_size(1120, 550)
driver.get("https://duckduckgo.com/")
driver.find_element_by_id('search_form_input_homepage').send_keys("realpython")
driver.find_element_by_id("search_button_homepage").click()
print driver.current_url
driver.quit()

The reason why I'd use something like phantomjs or selenium to control firefox is the browser just takes care of it. If you use other libraries with python you'll run into small errors possibly with https or other things.
You can always test using selenium with firefox so you can watch your browser do the work.

Bladewire 05-27-2017 07:53 PM

I've seen my posts here scraped onto other forums word for word video for video it's been going on for years. There's clones of me making someone money 👁️👃👁️

johnnyloadproductions 05-27-2017 07:54 PM

Quote:

Originally Posted by freecartoonporn (Post 21795511)
php advanced html dom

Python with selenium web driver, use web driver on phantomjs, parse with beautiful soup.

PHP is a great language (I don't care what anyone says) and works the easiest with most web servers but Python in my experience is one of the best general languages that is fast to complete a task and force people to convention.

I love Python, listen to me!

CPA-Rush 05-27-2017 08:23 PM

http://i.imgur.com/ay5xF2q.gif
http://cfile235.uf.daum.net/image/25...580FA7DB1726CC
https://68.media.tumblr.com/tumblr_l...7o1_r3_400.gif
https://media2.giphy.com/media/l41Yz...HU6k/giphy.gif
http://img0.joyreactor.cc/pics/post/...B8-1859778.gif
https://thechive.files.wordpress.com...1280.gif?w=600
https://2.bp.blogspot.com/-qeySikO_j...800-h800/8.jpg

deonbell 05-28-2017 09:30 PM

This is ugly, but works. It extracts bitcoin addresses from the page that has been downloaded. I could do better and add curl to the python script to make faster.

Code:

import sys

searchstring = "https://bitinfocharts.com/bitcoin/address/"

searchfile = open("bitcoin.html", "r")
for line in searchfile:
    if searchstring in line:
  #    bitaddress = line.split('"')

        htmlsplit = line.split(searchstring)
        counter = 1
        trimfile = open('trimfile.txt', 'w')
        while counter < 101:
            left_text = htmlsplit[counter].partition("\"")[0]
            print left_text
            trimfile.write(left_text.strip()+'\n')
            counter = counter + 1

  #      print bitaddress
        trimfile.close()
searchfile.close


deonbell 05-28-2017 10:28 PM

put the above code in a loop for the files I download.

Just run curl 80 times for each page. Run my file splitter and upload each file to sql.

http://i.imgur.com/Vh41yUx.png

Barry-xlovecam 05-29-2017 06:32 AM

step 1 curl the page and > save
step 2 oneliner parse and save the data

Code:

sed 's/>/>\n/g' bitcoin2.html|egrep '/bitcoin/address/'|cut -d'/' -f6|cut -d'"' -f1  |less
Code:

$ sed 's/>/>\n/g' bitcoin2.html|egrep '/bitcoin/address/'|cut -d'/' -f6|cut -d'"' -f1  |more
1JCe8z4jJVNXSjohjM4i9Hh813dLCNx2Sy
3Nxwenay9Z8Lc9JBiywExpnEFiLp6Afp8v
3D2oetdNuZUqQHPJmcMDDHYoqkyNVsFk9r
1FeexV6bAHb8ybZjqQMjJrcCrHGW9sb6uF
1HQ3Go3ggs8pFnXuHVHRytPCq5fGG8Hbhx
16ZbpCEyVVdqu8VycWR8thUL2Rd9JnjzHt
1KiVwxEuGBYavyKrxkLncJt2pQ5YUUQX7f
1PnMfRF2enSZnR6JSexxBHuQnxG8Vo5FVK
1AhTjUMztCihiTyA4K6E3QEpobjWLwKhkR
1DiHDQMPFu4p84rkLn6Majj2LCZZZRQUaa
1EBHA1ckUWzNKN7BMfDwGTx6GKEbADUozX
14e7XAZbepQp9MXXzjNG3fNLoAUpaBAXHW
18f1yugoAJuXcHAbsuRVLQC9TezJ6iVRLp
1LdRcdxfbSnmCYYNdeYpUnztiYzVfBEQeC
1EfBMK9q6rGFZazeF7jyNdTgqGYgcDgRE5
16cou7Ht6WjTzuFyDBnht9hmvXytg6XdVT
1MuYkciQTfRsU94ReAe5MiAfUpCrbLBcFR
15CVfJUC1LKn1GKZx6RM5UMbFfnTd8vTT4
18rnfoQgGo1HqvVQaAN4QnxjYE7Sez9eca
3J5KeQSVBUEs3v2vEEkZDBtPLWqLTuZPuD
19Mz2o9RDABT74SA9njZqMtJXKEzj2qUoH
323ENWgPNZdzsm2d6CzEaPTFrvavn1giv5
17hf5H8D6Yc4B7zHEg3orAtKn7Jhme7Adx

Why make things so complex?

>> wallets.csv
then;
mysql>
LOAD DATA LOCAL INFILE

deonbell 05-29-2017 11:56 PM

Quote:

Originally Posted by Barry-xlovecam (Post 21798961)
step 1 curl the page and > save
step 2 oneliner parse and save the data

Code:

sed 's/>/>\n/g' bitcoin2.html|egrep '/bitcoin/address/'|cut -d'/' -f6|cut -d'"' -f1  |less
Why make things so complex?

>> wallets.csv
then;
mysql>
LOAD DATA LOCAL INFILE

I did not know of sed command. Or I would have. But since I have the code written. I put it in a loop.

Process
1. Use curl to download the first 80 Naming them 1.html, 2.html, 3.html and so on.
2. Run my program that will parse all 80 files.
Code:

import sys

searchstring = "https://bitinfocharts.com/bitcoin/address/"

filecount = 1
while filecount < 81:
  searchfile = open(str(filecount)+".html", "r")
  for line in searchfile:
    if searchstring in line:
  #    bitaddress = line.split('"')

        htmlsplit = line.split(searchstring)
        counter = 1
        trimfile = open(str(filecount)+'.txt', 'w')
        while counter < 101:
            left_text = htmlsplit[counter].partition("\"")[0]
            print left_text
            trimfile.write(left_text.strip()+'\n')
            counter = counter + 1

        trimfile.close()
        filecount = filecount + 1
  searchfile.close

3. load data local 1.txt
up arrow and change to 2.txt and on and on.

Thank you for your help my friend.
You are the Kirk to my Khan.
http://i.imgur.com/nnzl3ty.gif

deonbell 05-30-2017 12:24 AM

One mistake in the above code: the line where I add 1 to filecount should be moved over to the left (out of the `if` block).

Kind of odd today. I notice the site is kind of messed up. I want to download the next 40 pages to add to my database. Maybe the site owner noticed me scraping every page up to 80?

I need to find a new source for my database. And maybe I will get use your sed command next.


All times are GMT -7. The time now is 04:30 PM.

Powered by vBulletin® Version 3.8.8
Copyright ©2000 - 2025, vBulletin Solutions, Inc.
©2000-, AI Media Network Inc