View Single Post
Old 04-18-2016, 02:28 PM  
johnnyloadproductions
Account Shutdown
 
Industry Role:
Join Date: Oct 2008
Location: Gone
Posts: 3,611
This is the parsing script used on the text files. Like I said, the GFY DOM is a mess, so the script is correspondingly messy in order to handle it. Sometimes the mods remove a user, leaving them as a guest, and that throws off the results; those cases are handled.

Full of print statements for debugging (I need to move away from that). I need to abstract away more of my code as well.

The parsing script:

Code:
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import create_engine
from sqlalchemy import MetaData
from sqlalchemy import Table, Column, Integer, Numeric, String, ForeignKey, text
from sqlalchemy import insert
from sqlalchemy import engine
Base = automap_base()
from bs4 import BeautifulSoup
import os
import re
import time


# Parse every saved thread page (<threadID>.txt) in `directory` and store the
# result in MySQL: one row in `threads` for the first post of a thread and one
# row in `posts` for every post found in the page.

# Record the start time so the total run time can be reported at the end.
start = time.time()
directory = '/path/to/textfiles'

# Patterns needed for every post; compiled once instead of once per post.
POST_TABLE_ID = re.compile('post')
POST_MESSAGE_ID = re.compile('post_message')
JOIN_DATE_LABEL = re.compile("Join Date:")
LOCATION_LABEL = re.compile("Location:")
POST_COUNT_LABEL = re.compile("Posts:")
THREAD_TITLE = re.compile('(?<=<strong>)(.*?)(?=</st)')


def _node_text(node):
    """Return the text of *node* after re-parsing its string form with lxml.

    *node* may be a tag, a string, a result list or ``None``.  It is passed
    through ``str()`` first, so a missing element (``None``) is parsed as the
    text ``'None'`` instead of raising an ``AttributeError`` here.
    """
    return BeautifulSoup(str(node), 'lxml').get_text()


# The table definitions never change, so they are built once for the whole run
# rather than once per post.
metadata = MetaData()
threads_table = Table('threads', metadata,
                      Column('threadID', String),
                      Column('title', String),
                      Column('username', String),
                      Column('year', Integer),
                      Column('month', Integer),
                      Column('day', Integer),
                      Column('hour', Integer),
                      Column('minute', Integer),
                      Column('AMorPM', String),
                      )
posts_table = Table('posts', metadata,
                    Column('threadID', String),
                    Column('username', String),
                    Column('year', Integer),
                    Column('month', Integer),
                    Column('day', Integer),
                    Column('hour', Integer),
                    Column('minute', Integer),
                    Column('AMorPM', String),
                    Column('join_year', Integer),
                    Column('join_month', String),
                    Column('post_in_thread', Integer),
                    Column('postcount', Integer),
                    Column('message', String),
                    )

# One engine (and connection pool) for the whole run instead of one per file.
engine = create_engine('mysql+pymysql://user:pass'
                       '@localhost/GFY_2016')

try:
    # Creates the tables only if they are missing.  Kept best-effort, as the
    # per-post calls it replaces were: a failure is reported, not fatal.
    try:
        metadata.create_all(engine)
    except Exception as exc:
        print('could not create tables (%s)' % type(exc).__name__)

    for i, filename in enumerate(os.listdir(directory), 1):
        # Progress indicator: running count of directory entries seen.
        print(i)
        if not filename.endswith('.txt'):
            continue

        # The file name without its extension is the thread id.
        threadID = filename.split('.')[0]

        # Read-only access is enough; the context manager closes the handle.
        with open(os.path.join(directory, filename), 'r') as f:
            soup = BeautifulSoup(f.read(), 'lxml')

        # Every post is rendered as a <table> whose id contains "post".
        for p in soup.find_all('table', attrs={'id': POST_TABLE_ID}):
            # Reset all per-post fields so that a lookup which fails on this
            # post cannot silently reuse the value parsed from the previous
            # post.
            year = month = day = hour = minute = 0
            join_month = join_year = post_in_thread = post_number = 0
            user_name = AMorPM = status = location = message = ""

            items = BeautifulSoup(str(p), 'lxml')

            # Post date, expected as "MM-DD-YYYY, HH:MM AM|PM".
            date_cell = items.find('td', attrs={'class': 'thead'})
            parsed_date = _node_text(date_cell).strip().split('-')
            try:
                # Fields are assigned one at a time on purpose: if the text is
                # shorter than expected, the parts already read are kept.
                month = parsed_date[0]
                day = parsed_date[1]

                parsed_date = parsed_date[2].split(',')
                year = parsed_date[0]

                post_time = parsed_date[1].split(':')
                hour = post_time[0]
                minute_and_meridiem = post_time[1].split(' ')
                minute = minute_and_meridiem[0]
                AMorPM = minute_and_meridiem[1]
            except Exception:
                pass

            # Position of the post inside the thread ('1' for the opener).
            try:
                post_anchor = items.find('a', attrs={'target': 'new'})
                post_in_thread = _node_text(post_anchor)
            except Exception:
                pass

            # Get the username of the individual
            try:
                user_name = _node_text(
                    items.find('a', attrs={'class': 'bigusername'}))
            except Exception:
                pass

            # Status of the user, e.g. confirmed or banned.
            # NOTE(review): status and location are parsed but are not written
            # to either table.
            try:
                status = _node_text(
                    items.find('div', attrs={'class': 'smallfont'}))
            except Exception:
                pass

            # Join date, expected as "Join Date: <month> <year>".
            try:
                join_fields = _node_text(
                    items.find(string=JOIN_DATE_LABEL)).split(' ')
                join_month = join_fields[2]
                join_year = join_fields[3]
            except Exception:
                pass

            # Location
            try:
                location = _node_text(items.find(string=LOCATION_LABEL))
            except Exception:
                pass

            # Total post count of the user, with thousands separators removed.
            try:
                post_count_text = _node_text(
                    items.find(string=POST_COUNT_LABEL)).strip()
                post_number = post_count_text.split(' ')[1].replace(',', '')
            except Exception:
                pass

            # Message body.  The list returned by find_all is stringified as a
            # whole, so the list brackets and escaped line breaks are stripped
            # afterwards.
            # NOTE(review): this also removes every '[' and ']' that occurs in
            # the post text itself.
            try:
                message = _node_text(items.find_all(id=POST_MESSAGE_ID))
                message = (message.replace('\\n', '').replace(']', '')
                           .replace('[', '').replace('\\r', ''))
            except Exception:
                pass

            # The first post of a thread also creates the thread entry.
            if post_in_thread == '1':
                try:
                    title_block = items.find('td', attrs={'class': 'alt1'})
                    title_div = BeautifulSoup(str(title_block), 'lxml').find(
                        'div', attrs={'class': 'smallfont'})
                    title_match = THREAD_TITLE.search(str(title_div))
                    if title_match is None:
                        raise ValueError('thread title not found')

                    engine.execute(insert(threads_table).values(
                        threadID=threadID,
                        title=title_match.group(0),
                        username=user_name,
                        year=year,
                        month=month,
                        day=day,
                        hour=hour,
                        minute=minute,
                        AMorPM=AMorPM
                    ))
                except Exception as exc:
                    # Best effort: report the problem and go on with the post.
                    print('thread %s not stored (%s)'
                          % (threadID, type(exc).__name__))

            # Every post, including the first one, is stored in `posts`.
            try:
                engine.execute(insert(posts_table).values(
                    threadID=threadID,
                    username=user_name,
                    year=year,
                    month=month,
                    day=day,
                    hour=hour,
                    minute=minute,
                    AMorPM=AMorPM,
                    join_year=join_year,
                    join_month=join_month,
                    post_in_thread=post_in_thread,
                    postcount=post_number,
                    message=message
                ))
            except Exception as exc:
                # Best effort: report the problem and go on with the next post.
                print('post %s of thread %s not stored (%s)'
                      % (post_in_thread, threadID, type(exc).__name__))
finally:
    # Release the pooled connections even if the run is interrupted.
    engine.dispose()

print(time.time() - start)
johnnyloadproductions is offline   Share thread on Digg Share thread on Twitter Share thread on Reddit Share thread on Facebook Reply With Quote