# -*- coding: utf-8 -*-
#rcpagelist.py
#GET THE LIST OF RECENTLY CHANGED PAGES FROM SPECIAL:RECENTCHANGES (RC),
#EITHER RETURNING IT AS A LIST
#OR, WHEN RUN DIRECTLY, PRINTING IT AND APPENDING IT TO pagelist.txt
import wikipedia
import re
import urllib
import codecs
### REGEX TO GET PAGENAMES FROM HTML:
#action=history" title="人馬座">歷史</a>
pageword = re.compile( ur'(?<=action\=history" title\=").*?(?="\>歷史\</a>)', flags=re.U)
### OPEN THE SITE
site = wikipedia.getSite()
rcUrl = '/w/index.php?title=special:recentchanges'
def RCpageList():
    """Return the distinct page names found on the recent-changes page.

    Fetches the HTML at ``rcUrl`` through ``site.getUrl`` and collects every
    match of the module-level ``pageword`` pattern.

    Returns:
        A list of the matched page names with duplicates removed, in the
        order each name first appears in the fetched HTML.
    """
    html = site.getUrl(rcUrl)
    seen = set()
    pages = []
    # A page edited several times shows up once per edit; keep only the
    # first occurrence so the result is duplicate-free and its order is
    # deterministic (a plain set would yield an arbitrary order).
    for title in pageword.findall(html):
        if title not in seen:
            seen.add(title)
            pages.append(title)
    return pages
def formatPageList(titles):
    """Return the report text: a header line, then one ``[[title]]`` per line.

    Args:
        titles: iterable of page-name strings.

    Returns:
        The header ``'page list from the RC:\\n'`` followed by
        ``'[[<title>]]\\n'`` for every title, in iteration order.
    """
    return 'page list from the RC:\n' + ''.join(
        '[[' + title + ']]\n' for title in titles)


def main():
    """Print the recent-changes page list and append it to pagelist.txt.

    The list is printed first, then the user is prompted. An answer that
    starts with "n" (any case) skips the write; any other answer, including
    just pressing Enter, appends the text to ``pagelist.txt`` as UTF-8.
    """
    text = formatPageList(RCpageList())
    # Single-argument print(...) behaves the same as the print statement.
    print(text)
    answer = raw_input('write file?.....')
    if answer.strip().lower().startswith('n'):
        return
    out = codecs.open('pagelist.txt', 'a+', 'utf-8')
    try:
        out.write(text)
    finally:
        # Close even if the write fails so the handle is never leaked.
        out.close()
    print('\n......file pagelist.txt written.')


# IF RUN DIRECTLY, PRINT THE LIST AND APPEND IT TO A FILE CALLED pagelist.txt
if __name__ == '__main__':
    try:
        main()
    finally:
        # Always call the framework's stopme(), even when main() raises.
        wikipedia.stopme()