Adsense

Sunday, October 26, 2014

Extract all G.K questions: LokSewa

This is a small Python script that extracts all the questions.
Copy the following code, paste it into Notepad, and save it as extract.py. Then run it with Python.
Python must be installed before running this script.

# -*- coding: utf-8 -*-
import os
import re
import codecs
from bs4 import BeautifulSoup
import mechanize


def pre_database(count=800):
    """Download the quiz page repeatedly and save each page's cleaned text.

    For every iteration the page at http://www.gk.nepalwebtech.com/ is
    fetched, every ``<div class="block">`` is extracted, the HTML tags are
    stripped, and the remaining non-blank lines are appended to
    ``c:\\test\\database\\pre_data\\quest_data_<i>.txt``.

    count -- number of pages to fetch (default 800, the original hard-coded
             value); a larger count yields more question files.

    This only works for the site above, but can be adapted to others.
    """
    base_dir = 'c:\\test\\database'
    tmp_dir = base_dir + '\\tmp'
    pre_dir = base_dir + '\\pre_data'
    data_path = tmp_dir + '\\data.txt'
    done_path = tmp_dir + '\\done.txt'

    # Create the working folders once, before the loop, and tolerate folders
    # left over from an earlier run. Creating them inside the loop raised
    # OSError on the second iteration because they already existed.
    for folder in (tmp_dir, pre_dir):
        if not os.path.isdir(folder):
            os.makedirs(folder)

    non_blank = re.compile(r'\S')

    for i in range(0, count):
        br = mechanize.Browser()
        link = br.open("http://www.gk.nepalwebtech.com/")
        bt = BeautifulSoup(link)
        fnd = bt.find_all("div", {"class": "block"})

        # Save the raw matches (still containing the HTML input forms).
        with open(data_path, 'w') as raw_out:
            raw_out.write(str(fnd))

        # Re-parse the saved markup and keep only its text nodes.
        with open(data_path) as raw_in:
            soup = BeautifulSoup(raw_in)
        all_txt = ''.join(soup.findAll(text=True))
        with open(done_path, 'w') as text_out:
            text_out.write(all_txt.encode("utf-8"))

        # Drop whitespace-only lines, then the first and last remaining
        # lines (as the original pipeline did via its mad.txt / frd.txt
        # intermediate files).
        with open(done_path, 'r') as text_in:
            kept = [line for line in text_in.readlines()
                    if non_blank.search(line)]
        kept = kept[1:-1]

        # Append the final lines to this iteration's output file, skipping
        # the ', ' separator lines that sit between the extracted blocks.
        out_path = pre_dir + '\\quest_data_' + str(i) + '.txt'
        with open(out_path, 'a') as fin:
            for line in kept:
                if line == ', \n':
                    print('processing ... ' + line)
                else:
                    fin.writelines(line)

        # Delete the temporary files so the next iteration starts clean.
        os.remove(data_path)
        os.remove(done_path)


 # Creating database
def post_database():
    """Turn the first scraped question file into a single-line record file.

    Reads ``c:\\test\\database\\pre_data\\quest_data_0.txt``, drops empty
    lines, and treats every five consecutive lines as one question followed
    by its four answers. Each group is formatted as a JSON-style record and
    appended to ``C:\\test\\database\\tmp\\<filename>.txt``; that file is then
    copied with all newlines removed to
    ``C:\\test\\database\\done_<filename>.txt``.

    The file name is asked for interactively. The correct answer is not
    known at this stage, so every record is written with "corr" set to 'A'.
    """
    with open("c:\\test\\database\\pre_data\\quest_data_0.txt", 'r') as fin:
        lines = fin.readlines()
    records = [l for l in lines if l != '\n']

    filename = raw_input("enter filename : ")
    print(len(records))
    raw_input('wait')  # pause so the user can check the line count

    tmp_path = "C:\\test\\database\\tmp\\" + filename + ".txt"
    done_path = "C:\\test\\database\\done_" + filename + ".txt"

    # The original template had a stray quote after "ques": (""%s"), which
    # corrupted every record; it is removed here.
    template = ('{"groupDesc":"","ques":"%s","A":"%s","B":"%s","C":"%s",'
                '"D":"%s","corr":"%s","exp":""},')
    cor = 'A'

    # Open the target once (append mode, as before) instead of re-opening it
    # for every record, and make sure it exists even when there are no
    # complete groups so the second pass below cannot fail on it.
    with open(tmp_path, 'a') as target:
        # Step through complete groups of five. Index the filtered list (the
        # original indexed the unfiltered one) and stop at len - 4 so the
        # last complete group is not skipped.
        for i in range(0, len(records) - 4, 5):
            quest, ans1, ans2, ans3, ans4 = records[i:i + 5]
            for part in (quest, ans1, ans2, ans3, ans4):
                print(part)
            data = template % (quest, ans1, ans2, ans3, ans4, cor)
            print(' Writing to file..........' + data)
            target.write(data)

    # Each field still carries its trailing newline; strip them all so the
    # finished file is one continuous line.
    with open(tmp_path, 'r') as src:
        with open(done_path, 'w') as dst:
            for line in src:
                dst.write(line.replace('\n', ''))

# Ask which stage to run. raw_input() returns a string, so the choice must be
# compared with the string '1'; comparing with the integer 1 was always False
# and made the post-database stage unreachable.
operation=raw_input('Enter 1 to create Post-Database \n Enter 2 to create pre-Database \n Note: Post Database is a final porcedure    = ')
if operation.strip() == '1':
    post_database()
else:
    # Any other input runs the pre-database (scraping) stage, as before.
    pre_database()

No comments:

Post a Comment