# User:R. Hillgentleman/CIAworldfactbooklist1.py
#
# From Wikipedia, the free encyclopedia
# Jump to navigation Jump to search
import wikipedia
import re
import urllib

# Obtain the site object from the wikipedia bot module.
# NOTE(review): `site` is not referenced in the visible part of this script --
# presumably kept for getSite()'s side effects; confirm before removing.
site=wikipedia.getSite()


#####TO GET THE LIST OF WORLD FACT BOOKS FROM ANY PAGE
###SET REGEX ################################
"""
SEEK CRAP LIKE THIS:
                <option value="xx.html">World</option>
                <option value="af.html">Afghanistan</option>
                <option value="ax.html">Akrotiri</option>

YIELDS A TRIPLE (n, list1, list2) = (number of codes, dictionary of codes, dictionary of countries); BOTH DICTIONARIES ARE KEYED BY POSITION
"""


def CIAworldfactbooklist(countrycode='ee', html=None):
    """Return the country codes and names listed on a CIA World Factbook page.

    The page is expected to contain entries of the form
    ``<option value="af.html">Afghanistan</option>``; the two-letter code and
    the label of each such entry are collected in page order.

    Parameters:
        countrycode: two-letter code of the Factbook page to download.  Only
            used when ``html`` is None.
        html: already-downloaded page source.  When given, nothing is
            fetched and this text is parsed instead.

    Returns:
        A tuple ``(n, list1, list2)``: ``n`` is the number of entries found,
        ``list1`` maps position (0..n-1) to the country code and ``list2``
        maps the same position to the country name.  Despite their names,
        both are dictionaries.
    """
    # Code and name are captured by ONE pattern so they always come from the
    # same <option> element.  Matching them with two independent patterns
    # lets the two sequences drift apart whenever an option's value is not a
    # two-letter code (its label would be collected without a code).
    option = re.compile(r'<option value="([a-z][a-z])\.html">(.+?)</option>')

    if html is None:
        thePage = ('https://www.cia.gov/library/publications/'
                   'the-world-factbook/geos/' + countrycode + '.html')
        # urllib.urlopen is the Python 2 API this script is written against.
        response = urllib.urlopen(thePage)
        try:
            html = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

    pairs = option.findall(html)

    # Callers index the results by position, so keep the dictionary shape.
    list1 = dict(enumerate(code for code, _ in pairs))
    list2 = dict(enumerate(country for _, country in pairs))

    return (len(pairs), list1, list2)

# Driver: fetch the Factbook list and show the first entries as "code name".
try:
    numbercrap, a, b = CIAworldfactbooklist()
    # Print at most 50 entries; bounding by the returned count avoids a
    # KeyError when the page yields fewer than 50.
    for i in range(min(50, numbercrap)):
        print('%s %s' % (a[i], b[i]))
finally:
    # Always tell the bot framework we are done, even if the fetch or the
    # printing above failed.
    wikipedia.stopme()