Szerkesztő:BinBot/korrkat.py

Ezt arra tudom használni, hogy korrektúrázandó lapokat keressek értelmes és számomra érdekes kategóriák szerint, kiszűrve, amivel valószínűleg nem érdemes foglalkozni.
#coding: utf-8
"""
This script takes a maintenance category called BASECAT, and lists all the
pages in it by categories to a wikipage called BASEPAGE. Only the non-hidden
categories take part in the game.
If the title or any of the categories of the page or primary supercategories
contains any of the words in stoplist, the page will be thrown away as
uninteresting.
Unlisted_categories will not prevent the article from being listed; they just
don't appear in the list and don't count. These are also title fragments.
Any previous content of the target page is overwritten without warning.
(It is good if you want to choose maintenance tasks you are fond of.)
"""

import time, re
import wikipedia as pywikibot
import pagegenerators

# Full name of the maintenance category; it is linked in the header of the
# generated page.
BASECAT = u'Kategória:Korrektúrázandó lapok'
# Title of the wiki page the finished list is saved to.
BASEPAGE = u'Szerkesztő:Bináris/Korrektúrázandó lapok kategóriánként'
# Passed as the second argument of Page.put() when the list is saved
# (presumably the edit summary -- confirm against the framework).
editcomment = u'Korrektúrázandó lapok kategóriánként'

# Text fragments searched for in the page title, in the titles of the page's
# categories and in the titles of those categories' supercategories. A page
# with any hit is left out of the list. The fragments are also printed on the
# generated page.
stoplist = [
    u'film',
    u'epizód',
    u'szereplő',
    u'anime',
    u'sorozat', # This catches serial killers ("sorozatgyilkos") too (and indeed, Dennis Nilsen)
    u'televízió',
    u'video',
    u'játék',
    u'labdarúg',
    u'futball',
]

# Text fragments searched for in category titles. A category with a hit is
# skipped (neither listed nor counted) but does not exclude the page itself.
# The fragments are also printed on the generated page.
unlisted_categories = [
    u'n született személyek',
    u'n elhunyt személyek',
    u'ATC ',
    u'Országgyűlési képviselők ('
]

# Initial settings:
t = time.time() # Start timestamp; the run time is reported on the page.
site = pywikibot.getSite()
# Header of the generated page; statistics are appended to it later.
text = u"'''[[:%s]]''' (kissé megrostálva)\n" % BASECAT
text += u'Készült: ~~~~~\n'
text2 = '\n' # Body of the generated page (the category list itself)
bot = pagegenerators.GeneratorFactory()
# The category name is derived from BASECAT (namespace prefix dropped, ':'
# put in front) instead of being spelled out a second time, so the generator
# and the page header cannot drift apart. For the current BASECAT this gives
# exactly the string that used to be hard-coded here.
gen = bot.getCategoryGen(u':' + BASECAT.split(u':', 1)[-1], 0)
titles = {} # Dict: keys are pages, values are categories
cats = {} # Dict: keys are categories, values are pages
ini = '' # Initial letter of categories for better page dissection

def matches(list, s):
    """Tell whether any string of *list* occurs in *s*, ignoring case.

    list -- iterable of plain text fragments (not regular expressions); each
            one is escaped before it is put into the pattern. The parameter
            name shadows the builtin; it is kept for existing callers.
    s    -- the string to search in.

    Returns True if at least one fragment is found in s, otherwise False.
    An empty list never matches.
    """
    fragments = [re.escape(fragment) for fragment in list]
    if not fragments:
        # An empty alternation compiles to '()', which matches every string;
        # an empty stoplist would then throw away every single page.
        return False
    # The 'u' flag extends case-insensitive matching to non-ASCII letters on
    # Python 2; on Python 3 it is already the default for text patterns.
    pattern = u'(?iu)(%s)' % u'|'.join(fragments)
    return re.search(pattern, s) is not None
def matcheslist(list, clist):
    """Tell whether the title of any object in *clist* matches *list*.

    list  -- text fragments, handed over unchanged to matches().
    clist -- iterable of objects having a title() method (category objects
             in this script).

    Returns True as soon as matches() is true for one title, and False if
    it is true for none of them (which includes an empty clist).
    """
    return any(matches(list, category.title()) for category in clist)

# Collect pages and categories:
# Every page of the generator is checked against the stoplist by its own
# title, by the titles of its categories and by the titles of their
# supercategories. Pages that pass are stored in `titles` together with their
# listable categories, and are filed under each of those categories in `cats`.
num = 0
for num, page in enumerate(gen, 1):
    print(num)
    pywikibot.output(page.title())
    if matches(stoplist, page.title()):
        print('Kidobtam!') # (Thrown out)
        continue
    kept = [] # Listable categories of this page
    rejected = False # True if we find any stopword in categories.
    for category in page.categories():
        try:
            category_title = category.title()
            if matches(stoplist, category_title) or \
                    matcheslist(stoplist, category.supercategoriesList()):
                print('Kidobtam!')
                pywikibot.output(category_title)
                rejected = True
                break
            # This is examined after stoplist because hidden and unlisted
            # categories are allowed to contain a stopword, too.
            if category.isHiddenCategory() or \
                    matches(unlisted_categories, category_title):
                continue
            kept.append(category)
        except pywikibot.NoPage:
            pywikibot.output(category.title() + ' is missing.')
    if rejected:
        continue
    # The page is filed under its categories only now that all of them have
    # passed, so nothing has to be taken back out of cats for rejected pages.
    titles[page] = kept
    for category in kept:
        cats.setdefault(category, []).append(page)

# Create a list:
# One level-2 heading per initial letter, one level-4 heading per category,
# and a numbered entry per page showing how many listed categories it has.
text += u'Listázva: %d lap %d kategóriából ' % (len(titles), len(cats))
parts = []
for c in sorted(cats):
    initial = c.title(withNamespace=False)[0]
    if initial != ini:
        ini = initial
        parts.append('\n== %s ==\n' % ini)
    #With three =s it was pretty unreadable.
    parts.append('\n==== ' + c.title(asLink=True, textlink=True) + ' ====\n')
    for art in cats[c]:
        parts.append(u'# ' + art.title(asLink=True, textlink=True) +
                     u' (%d kategória)\n' % len(titles[art]))
text2 += ''.join(parts)
# List uncategorized (or only hiddenly categorized) pages:
uncat = [pag for pag in titles if not titles[pag]]
if uncat:
    uncat_links = sorted(pag.title(asLink=True, textlink=True)
                         for pag in uncat)
    text2 += u'\n== Kategórián kívüli cikkek ==\n# ' + '\n# '.join(uncat_links)

# Submit the page:
# Finish the header (run time, stoplist, unlisted categories), append the
# list body, echo the result and save it to BASEPAGE.
elapsed = int(time.time() - t) # Whole seconds since the start timestamp
minutes, seconds = divmod(elapsed, 60)
text += u'%d perc %d mp alatt.<br />' % (minutes, seconds)
# The line break keeps the stoplist and the unlisted categories on separate
# lines; without it the second label is glued to the last stopword.
text += u"'''Tiltólista:''' " + ', '.join(stoplist) + u'<br />' # Stoplist
# Unlisted categories
text += u"'''Nem listázott kategóriák:''' " + ', '.join(unlisted_categories)
text += text2
pywikibot.output(text)
pywikibot.Page(site, BASEPAGE).put(text, editcomment)