# -*- coding: utf-8 -*-
# User:R. Hillgentleman/rcpagelist.py
#
# 出自維基百科,自由嘅百科全書 (From Wikipedia, the free encyclopedia)
# Jump to navigation Jump to search
#rcpagelist.py
#TO GET A LIST OF RECENTLY CHANGED PAGES FROM RC (Special:RecentChanges),
#EITHER RETURNING IT AS A LIST
#OR PRINTING IT ONTO pagelist.txt

import wikipedia
import re
import urllib
import codecs

### REGEX TO GET PAGENAMES FROM HTML:
# EXAMPLE OF THE HTML FRAGMENT BEING MATCHED:
#action=history" title="人馬座">歷史</a>
# THE LOOKBEHIND AND LOOKAHEAD ARE NOT PART OF THE MATCH, SO findall() RETURNS
# ONLY THE TEXT BETWEEN title=" AND THE CLOSING QUOTE (THE PAGENAME);
# .*? IS NON-GREEDY SO EACH MATCH STOPS AT THE NEAREST ">歷史</a>.
# NOTE: THE PATTERN RELIES ON THE LINK TEXT 歷史, SO IT ONLY WORKS WHEN THE
# HTML IS SERVED WITH THAT INTERFACE WORDING.
pageword = re.compile( ur'(?<=action\=history" title\=").*?(?="\>歷史\</a>)', flags=re.U)

### OPEN THE SITE
# NOTE(review): presumably THE DEFAULT SITE FROM THE BOT'S CONFIGURATION -- CONFIRM
site = wikipedia.getSite()
# SERVER-RELATIVE ADDRESS OF THE RECENT CHANGES PAGE, PASSED TO site.getUrl()
rcUrl = '/w/index.php?title=special:recentchanges'

def RCpageList():
  """Return the page names found on the recent changes page, without duplicates.

  Fetches rcUrl through the module-level `site`, extracts every match of the
  module-level `pageword` pattern from the returned HTML, and returns each
  distinct match once, in the order of its first appearance in that HTML.

  The names are returned exactly as they appear in the HTML.
  NOTE(review): HTML entities in titles (e.g. &amp;) are presumably left
  escaped -- confirm against real RC output.
  """
  html = site.getUrl(rcUrl)     # GET THE HTML OF THE RC
  seen = set()                  # PAGENAMES ALREADY COLLECTED
  pages = []
  # A PAGE EDITED SEVERAL TIMES SHOWS UP SEVERAL TIMES IN THE MATCHES;
  # KEEP ONLY ITS FIRST OCCURRENCE SO THE RESULT FOLLOWS THE ORDER OF THE RC PAGE
  for pagename in pageword.findall(html):
    if pagename not in seen:
      seen.add(pagename)
      pages.append(pagename)
  return pages
# IF RUN DIRECTLY, PRINT IT ONTO A FILE CALLED pagelist.txt
if __name__ == '__main__':
  l = RCpageList()
  text='page list from the RC:\n'
  for i in l :
    text += '[['+ i + ']]\n'
  print text
  x = raw_input('write file?.....')
  file = codecs.open('pagelist.txt','a+','utf-8')
  file.write(text)
  print'\n......file pagelist.txt written.'
  wikipedia.stopme()