This is the parsing script used on the text files. As I said, the GFY DOM is a mess, so the script is correspondingly messy in order to handle it. Sometimes the mods remove a user and leave them as a guest, which garbles the parsed results; the script handles those cases.
It is full of print statements for debugging (I need to move away from that), and I need to abstract more of my code into functions as well.
Code:
import logging
import os
import re
import time

from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy import engine
from sqlalchemy import insert
from sqlalchemy import MetaData
from sqlalchemy import Table, Column, Integer, Numeric, String, ForeignKey, text
from sqlalchemy.ext.automap import automap_base

Base = automap_base()
# Time the script starter
start = time.time()
directory = '/path/to/textfiles'

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# Search patterns, compiled once instead of once per post.
POST_TABLE_ID = re.compile('post')
POST_MESSAGE_ID = re.compile('post_message')
JOIN_DATE_LABEL = re.compile("Join Date:")
POST_COUNT_LABEL = re.compile("Posts:")
THREAD_TITLE_HTML = re.compile('(?<=<strong>)(.*?)(?=</st)')

# Values stored for the date columns when a post's date cannot be parsed.
EMPTY_POST_DATE = {
    'year': 0, 'month': 0, 'day': 0, 'hour': 0, 'minute': 0, 'AMorPM': "",
}

# Columns that a ``threads`` row shares with the ``posts`` row of post #1.
THREAD_FIELDS = (
    'threadID', 'username', 'year', 'month', 'day', 'hour', 'minute', 'AMorPM',
)

# Table definitions, built once for the whole run rather than once per post.
# NOTE(review): the String columns carry no length, which the MySQL dialect
# needs in order to emit CREATE TABLE, so create_all() presumably only
# succeeds because both tables already exist -- confirm against the schema.
metadata = MetaData()
threads_table = Table(
    'threads', metadata,
    Column('threadID', String),
    Column('title', String),
    Column('username', String),
    Column('year', Integer),
    Column('month', Integer),
    Column('day', Integer),
    Column('hour', Integer),
    Column('minute', Integer),
    Column('AMorPM', String),
)
posts_table = Table(
    'posts', metadata,
    Column('threadID', String),
    Column('username', String),
    Column('year', Integer),
    Column('month', Integer),
    Column('day', Integer),
    Column('hour', Integer),
    Column('minute', Integer),
    Column('AMorPM', String),
    Column('join_year', Integer),
    Column('join_month', String),
    Column('post_in_thread', Integer),
    Column('postcount', Integer),
    Column('message', String),
)


def _node_text(node, default=""):
    """Return the stripped text of a BeautifulSoup tag, or *default* if None."""
    if node is None:
        return default
    return node.get_text().strip()


def _parse_post_date(date_string):
    """Split a post timestamp into the date columns.

    The separators used imply the layout
    ``<month>-<day>-<year>, <hour>:<minute> <AM/PM>``.  A string that does
    not split that way yields the zero/empty defaults for every field, so a
    malformed date never produces a half-filled row.

    Returns a dict keyed by year, month, day, hour, minute and AMorPM.
    """
    fields = dict(EMPTY_POST_DATE)
    try:
        month, day, rest = date_string.split('-')[:3]
        year, clock = rest.split(',')[:2]
        hour, minute_part = clock.split(':')[:2]
        minute, meridiem = minute_part.split()[:2]
    except ValueError:
        # Too few pieces after one of the splits.
        log.debug("Unrecognised post date: %r", date_string)
        return fields
    fields.update(
        year=year.strip(), month=month.strip(), day=day.strip(),
        hour=hour.strip(), minute=minute, AMorPM=meridiem,
    )
    return fields


def _parse_join_date(post):
    """Return join_month/join_year from the "Join Date: <month> <year>" text.

    Both default to 0 when the label is absent or has fewer than four words.
    """
    label = post.find(string=JOIN_DATE_LABEL)
    words = label.split() if label is not None else []
    if len(words) < 4:
        return {'join_month': 0, 'join_year': 0}
    return {'join_month': words[2], 'join_year': words[3]}


def _parse_post_count(post):
    """Return the number after "Posts:" with thousands commas removed, or 0."""
    label = post.find(string=POST_COUNT_LABEL)
    words = label.split() if label is not None else []
    if len(words) < 2:
        return 0
    return words[1].replace(',', '')


def _parse_message(post):
    """Return the text of the post_message element(s) as a single line.

    Several matching elements are joined with ", "; line breaks are removed.
    """
    bodies = post.find_all(id=POST_MESSAGE_ID)
    combined = ", ".join(body.get_text() for body in bodies)
    return combined.replace('\n', '').replace('\r', '')


def _parse_thread_title(post):
    """Return the markup inside the first <strong> of the title cell, or None.

    The title is looked up in the first ``div.smallfont`` inside the first
    ``td.alt1`` of the post.
    """
    title_cell = post.find('td', attrs={'class': 'alt1'})
    if title_cell is None:
        return None
    title_div = title_cell.find('div', attrs={'class': 'smallfont'})
    if title_div is None:
        return None
    found = THREAD_TITLE_HTML.search(str(title_div))
    return found.group(0) if found else None


def _parse_post(post, thread_id):
    """Build the ``posts`` row (as a dict) for one post table.

    Every field is computed fresh for this post, so a field that is missing
    here gets its default (0 or "") instead of whatever an earlier post in
    the same file happened to contain.
    """
    row = {
        'threadID': thread_id,
        # Presumably absent for removed/guest users; stored as "" then.
        'username': _node_text(post.find('a', attrs={'class': 'bigusername'})),
        'post_in_thread': _node_text(
            post.find('a', attrs={'target': 'new'}), default=0),
        'postcount': _parse_post_count(post),
        'message': _parse_message(post),
    }
    # status and location are not columns of either table, so they are not
    # extracted.
    date_cell = post.find('td', attrs={'class': 'thead'})
    row.update(_parse_post_date(_node_text(date_cell)))
    row.update(_parse_join_date(post))
    return row


def _insert_row(db_engine, table, values):
    """Insert one row in its own transaction; return True on success.

    Failures are logged and swallowed so that one bad row does not stop the
    run.  ``Exception`` is caught on purpose: the driver can raise errors
    that are not SQLAlchemy exceptions (for instance while encoding a value).
    """
    try:
        with db_engine.begin() as connection:
            connection.execute(insert(table).values(**values))
    except Exception:
        log.exception(
            "Insert into %s failed (thread %s, post %s)",
            table.name, values.get('threadID'), values.get('post_in_thread'))
        return False
    return True


def _process_thread_file(db_engine, path):
    """Parse one saved thread page and store its thread and post rows.

    The thread ID is the file name up to its first dot.  A ``threads`` row is
    written for the post numbered 1; a ``posts`` row is written for every
    post table found.
    """
    thread_id = os.path.basename(path).split('.')[0]
    # Read bytes and let BeautifulSoup work out the encoding; the file is
    # closed as soon as it has been read.
    with open(path, 'rb') as handle:
        soup = BeautifulSoup(handle.read(), 'lxml')

    for post in soup.find_all('table', attrs={'id': POST_TABLE_ID}):
        row = _parse_post(post, thread_id)

        # This creates a new thread entry if the post is the first one.
        if row['post_in_thread'] == '1':
            title = _parse_thread_title(post)
            if title is None:
                log.warning(
                    "Thread %s: no title found in first post; "
                    "thread row skipped", thread_id)
            else:
                thread_row = {key: row[key] for key in THREAD_FIELDS}
                thread_row['title'] = title
                _insert_row(db_engine, threads_table, thread_row)

        _insert_row(db_engine, posts_table, row)


# One engine for the whole run; it is disposed even if a file blows up.
engine = create_engine('mysql+pymysql://user:pass'
                       '@localhost/GFY_2016')
try:
    metadata.create_all(engine)
    # file number tracker
    for i, file_name in enumerate(os.listdir(directory), 1):
        print(i)
        if file_name.endswith('.txt'):
            _process_thread_file(engine, os.path.join(directory, file_name))
finally:
    engine.dispose()
print(time.time() - start)