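# -*- coding: utf-8 -*-
# User:R. Hillgentleman/statNew.py
# 閱讀設定  (apparently wiki viewer chrome, "reading settings", captured together with the page title above; not part of the script)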
#statNew.py
#STAT THE [[special:newpages|]] in yue.wikipedia
import re
import wikipedia
import pagegenerators
### settings
# datename: name of the sub-page of "Wikipedia:統計/" that receives the table
#           (an earlier hard-coded value was '2007-11-01').
# shortcut: title of the page that is turned into a redirect to that sub-page.
#
# Both answers are concatenated with unicode literals below and again when
# the pages are saved.  raw_input() returns a byte string, so a non-ASCII
# answer made those concatenations raise UnicodeDecodeError on Python 2.
# wikipedia.input() is the framework's prompt helper and returns unicode
# (per its documentation it adds the space after the question itself).
datename = wikipedia.input(u'name of sub-stat-page (date?)')
shortcut = wikipedia.input(u'name of short-cut?')

# Number of entries requested from the new-pages list.  An earlier revision
# asked for it interactively ('number of pages to check, ignoring years?').
numberofpages = 500
# Passed straight through to wikipedia.Site.newpages(number, get_redirect,
# repeat); both switches are turned off.
repeat = False
get_redirect = False

# Pieces of the report page.  The wikitext itself is reproduced unchanged.
# The second literal of line1 used to lack the u prefix; it contains
# non-ASCII text, so on Python 2 adding it to a unicode string raised
# UnicodeDecodeError.
line1 = (u';捷徑[[' + shortcut +
         u']]\n新文統計 ([[user:R. Hillgentleman/statNew.py]]):~~~~\n')
line2 = u'{|class="wikitable sortable"'
# Column headers: title, length, count of "{{", count of "[[", then one
# column per (Cantonese form / other form) pair, then an unnamed last column.
line3 = u'\n!文||字|| <nowiki>{{</nowiki> || <nowiki>[[</nowiki> ||嘅/的||係/是||唔/不||咗/了||'
line4 = u'\n'
line6 = u'\n|}'
line7 = u'\n==註==\n<references/>'
# Footnote listing the patterns used for 不, 的 and 是.
# NOTE(review): the pattern used for 了 is not listed here -- confirm whether
# that omission is intentional.
line8 = u'\n*不(?!屈|惜|朽|治|測)\n*(?<!目|中|麗)的(?!確|而)\n*(?<!為|國|於)是(?!但|非|為|故)'

startofpage = line1 + line2 + line3 + line4
endofpage = line6 + line7 + line8
text = startofpage  # OUTPUT TEXT; the table rows are appended to this later

# Sample of the wikitable markup the report follows:
#
#   {|class="wikitable sortable"
#   |-
#   |d||d||e||g
#   |-
#   |e||tr||t||e
#   |-
#   |23||34||f||5
#   |}
############## REGEX ########
# Patterns counted for every new page.
#
# The literals use the u'' prefix rather than ur'': none of them contains a
# backslash, so the compiled pattern is identical, and ur'' is a SyntaxError
# on Python 3.

# Opening double braces / double brackets.
braces = re.compile(r'\{\{')
brackets = re.compile(r'\[\[')

# Cantonese forms (the "good" word of each pair).
ge = re.compile(u'嘅', flags=re.U)
hai = re.compile(u'係', flags=re.U)
nm = re.compile(u'唔', flags=re.U)
zo = re.compile(u'咗', flags=re.U)

# Counterpart forms (the "bad" word of each pair).  The look-behind and
# look-ahead groups skip an occurrence when it directly follows or precedes
# one of the listed characters (e.g. 目的, 的確, 於是, 是非, 不屈, 了解).
dig = re.compile(u'(?<!目|中|麗)的(?!確|而)', flags=re.U)
si = re.compile(u'(?<!為|國|於)是(?!但|非|為|故)', flags=re.U)
bud = re.compile(u'不(?!屈|惜|朽|治|測)', flags=re.U)
liu = re.compile(u'(?<!不)了(?!解|結)', flags=re.U)

# (good word, bad word) pairs, in the order of the table columns
# 嘅/的, 係/是, 唔/不, 咗/了.
craplist = [(ge, dig), (hai, si), (nm, bud), (zo, liu)]
##################################### BUILD AND PUBLISH THE REPORT ###########
# Remark placed in the last column, keyed by the number of word pairs in
# which the non-Cantonese form outnumbers the Cantonese one.  Counts of 0 or
# 1 get no remark.
_VERDICTS = {
    2: u'唔似廣東話',
    3: u'應該唔係廣東話',
    4: u'唔係廣東話',
}


def _count(pattern, wikitext):
    """Return the number of non-overlapping matches of pattern in wikitext."""
    return len(pattern.findall(wikitext))


def _stat_row(title, wikitext):
    """Return the wikitable row (unicode) for one page.

    Cells, in order: link to the page, length of the fetched text, number of
    "{{", number of "[[", then one "m/n.=.q" cell per (good, bad) pair of
    craplist, then the remark cell.
    """
    link = _count(brackets, wikitext)
    cells = [
        u'[[' + title + u']]',
        # Length of the text actually fetched; the length reported by the
        # new-pages list is deliberately not used.
        u'%d' % len(wikitext),
        u'%d' % _count(braces, wikitext),
        u'%d' % link,
    ]

    count = 0  # pairs where the bad form occurs and outnumbers the good one
    for goodword, badword in craplist:
        m = _count(goodword, wikitext)
        n = _count(badword, wikitext)
        if n != 0:
            # Explicit floor division: same result as Python 2's int / int,
            # and the q == 0 test below depends on it.
            q = m // n
        else:
            q = -1  # no bad form at all; never counted against the page
        if q == 0:
            count += 1
        cells.append(u'%d/%d.=.%d' % (m, n, q))

    remark = _VERDICTS.get(count, u'')
    if link == 0:
        remark += u'<br/>要維基化'  # page has no "[[" at all
    return u'\n|-\n|' + u'||'.join(cells) + u'||' + remark


# stopme() is pywikipedia's end-of-run call; keeping it in a finally clause
# makes sure it still happens when something above raises.
try:
    site = wikipedia.getSite()

    # wikipedia.Site.newpages(number=10, get_redirect=False, repeat=False)
    # yields tuples of: Page object, timestamp (unicode), length (int),
    # an empty unicode string, username or IP address (str),
    # comment (unicode).
    newpages = site.newpages(numberofpages, get_redirect, repeat)

    rows = []
    for page, timestamp, length, empty, user, comment in newpages:
        if page.isRedirectPage():
            continue
        title = page.title()
        if u'年' in title:  # skip pages whose name contains '年'
            continue
        try:
            wikitext = page.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            # E.g. the page was deleted or redirected after being listed;
            # skip it instead of aborting the whole report.
            continue
        row = _stat_row(title, wikitext)
        rows.append(row)
        # wikipedia.output() encodes for the console; a bare print of a
        # unicode row can raise UnicodeEncodeError on Python 2.
        wikipedia.output(row)

    # END OF PAGE
    text += u''.join(rows) + endofpage + u'[[Category:維基百科統計]]'

    # Save the report page.
    sand = wikipedia.Page(site, u'Wikipedia:統計/' + datename)
    sand.put(text, u'新文統計: [[user:R. Hillgentleman/statNew.py]]')

    # Create the shortcut as a redirect to the report page.
    short = wikipedia.Page(site, shortcut)
    short.put(u'#REDIRECT [[Wikipedia:統計/' + datename + u']]')
finally:
    wikipedia.stopme()
#########################################
# SOME COMMENTED OUT CRAP
#
#ge = re.compile(ur'嘅') # or ur'\u5605'
#br = re.compile(r'\{\{')
#bl = re.compile(r'\}\}')
#newstr , n = ge.subn('',text) # replace every ur'嘅' by empty string
#newstr1 , n1= br.subn('',newstr)
#newstr2 , n2= bl.subn('',newstr1)
#wikipedia.output( 'the number of of GE in sandbox is: ')
#print n
#print ('numbers of {{,}}in sandbox are:')
#print n1 , n2
#wikipedia.stopme()
############################################