Indice

Esempi di Python

HowToForge To Wiki Converter

In questo esempio si vede come lavora il parser HTML integrato in Python: questo programmino serve per alimentare la presente Wiki con documentazioni varie provenienti da HowToForge. Nell'esempio sono state usate solo librerie integrate in Python e la versione usata è la 2.7.x.

#!/usr/bin/python
 
# ###############################################################################################
# HowToForge to Wiki converter
# ###############################################################################################
# Notes : Originaly developed in Embarcadero Delphi 2007 ... Ported to python for cross-platform
#         compatibility
# Date    : 20121128
# Author  : Ascanio Pressato
# CoAuthor: Massimo Fioravanti
# Lic.    : Creative Commons
# ###############################################################################################
 
import urllib2
import sys
import re
import os
import argparse 
import textwrap
import base64
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
 
 
# Const
# DokuWiki markup constants used when emitting the converted article.
# Headings: DokuWiki counts '=' signs downwards, so H1 uses the most.
KBeginH1 = '===== '
KEndH1 = ' ====='
KBeginH2 = '==== '
KEndH2 = ' ===='
KBeginH3 = '=== '
KEndH3 = ' ==='
KBeginH4 = '== '
KEndH4 = ' =='
KBeginH5 = '= '
KEndH5 = ' ='

# Inline text formatting markers.
KBold = '**'
KItalic = '//'
KUnderline = '__'
KBeginStrike = '<del>'
KEndStrike = '</del>'
KCodeStr = "''"

# Link delimiters: [[url|text]]
KBeginLink = '[['
KEndLink = ']]'

# List item prefixes (ordered / unordered).
KNumList = '  - '
KSymbolList = '  * '

# Horizontal rule.
KLine = '----'

# Keyboard-key markup (plugin syntax).
KBeginKey = '<key>'
KEndKey = '</key>'

# Boxed text (plugin syntax).
KBeginBox = '<box>'
KEndBox = '</box>'

# Verbatim file block.
KBeginFile = '<file>'
KEndFile = '</file>'
 
KBeginCode = '<code>'
# NOTE(review): the values below were destroyed when this listing was pasted
# into the wiki (the renderer interpreted them as live markup).  They are
# reconstructed from how the rest of the program uses them; confirm against
# the original source where marked with TODO.
KEndCode = '</code>'

# DokuWiki note-plugin markers (used for the 'sponsor' boxes).
KBeginNote = '<note>'
KEndNote = '</note>'

# DokuWiki media/image embed markers: {{ns:file.png|caption}}
KBeginImage = '{{'
KEndImage = '}}'

# DokuWiki forced line break -- TODO confirm (could also have been '\n').
KParagraph = r'\\ '

# Regex patterns locating the HTML <title> element.
KSearchBeginTitle = r"(<title>)"
KSearchEndTitle = r"(<\/title>)"

# Characters not allowed in page/path names, each mapped 1:1 to '_'.
KInvalidUrlChar = ' :.,\/+*'
KValidUrlChar = ''.ljust(len(KInvalidUrlChar), '_')
# Markers delimiting the article body inside the HowToForge page.
# TODO(review): the exact KEndData / KEndData3 values were lost in the wiki
# paste; KEndData3 is the one CleanHTML() actually searches for.
KBeginData = "<!-- begin content -->"
KEndData = "<!-- end content -->"
KEndData2 = "<!-- AddThis Button END -->"
KEndData3 = "<!-- AddThis Button END -->"

# Mutable converter state (the per-page flags are reset by ResetAllVar).
OutPutWiki = []        # accumulated wiki output fragments
SlClassCSS = []        # stack of (tag, css-class) for currently open tags
CodeMode = False       # True while inside a code/file block
IsScript = False       # True while inside <script>/<noscript> (content dropped)
IsNote = False         # True while inside the 'sponsor' note box
IsCmd = False          # NOTE(review): never set by the visible code
OkTrim = False         # True => whitespace-trim text nodes

KBaseUrl = 'http://www.howtoforge.com/'
CreditsString = ""     # filled in by MyCreditsParser
CreditsInfo = []       # rendered credits table (one joined string)
canWork = False        # MyCreditsParser: inside <div class="info">
PassCount = 0          # MyCreditsParser: author link closed only once
PagesList = []         # every page URL converted (listed in the credits table)
KCreditsHeader = 'Revisioni'

# Page range; -1 means "not specified on the command line".
BeginPage = -1
EndPage = -1

KVersion = "2.9 Python"
# Author names, base64-encoded.
KAuthor = "QXNjYW5pbyBQcmVzc2F0bw"
KCoAuthor = "TWFzc2ltbyBGaW9yYXZhbnRp"

# Transcode Table (HTML -> DokuWiki):
#   <p class="command"></p>   => code block
#   <p class="system"></p>    => italic
#   <p></p>                   => paragraph
#   <span class="system">     => italic; <span class="highlight"> => bold+italic
#   <pre></pre>               => file block
#   <br>                      => paragraph break
#   <a href="">text</a>       => [[href|text]]
#   <img>                     => downloaded + {{...}} media tag
#
# Sample of the line the credits extractor works on:
#   Submitted by <a href="forums/member.php?u=2" title="View user profile."
#   rel="nofollow">falko</a> (...) on Sun, 2012-10-21 17:33. :: <a ...>ISPConfig</a> | ...

# ############################################################################################### # VCL # ###############################################################################################

def valueOf(List, Name):
    """Return the value of the last attribute called *Name* in List.

    List is a sequence of (name, value) pairs (HTMLParser attrs);
    an empty string is returned when no pair matches.
    """
    hits = [value for (name, value) in List if name == Name]
    return hits[-1] if hits else ""

def SameText(str1, str2):
    """Case-insensitive string comparison (Delphi SameText equivalent)."""
    return str1.lower() == str2.lower()

def IncludeTrailingPathDelimiter(aPath):
    """Return aPath ending with a path separator (Delphi-style helper).

    Joining with an empty component appends exactly one separator
    when it is missing.
    """
    return os.path.join(aPath, '')

def NameSpaceToPath(aNameSpace):
    """Map a DokuWiki namespace ('a:b.c') to a relative path ('a/b_c').

    Dots inside each namespace component are replaced by underscores.
    """
    path = ""
    for component in (part.replace(".", "_") for part in aNameSpace.split(':')):
        path = os.path.join(path, component)
    return path

def ForceDirectories(aPath):
    """Create aPath (with any missing parents) if it does not exist yet.

    Fix over the original check-then-create: os.path.exists + makedirs
    could race (directory created in between -> OSError) and silently
    accepted a plain *file* at aPath.  Here we attempt the creation and
    only swallow the error when aPath really is a directory afterwards.
    """
    try:
        os.makedirs(aPath)
    except OSError:
        if not os.path.isdir(aPath):
            raise

# ############################################################################################### # GENUTIL # ###############################################################################################

def SostituisciAllChar(St, Replaced, ReplacedTo):
    """Replace every occurrence of Replaced[i] in St with ReplacedTo[i].

    Replaced and ReplacedTo are parallel character strings: character i
    of the former maps to character i of the latter.
    """
    out = St
    for idx, ch in enumerate(Replaced):
        out = out.replace(ch, ReplacedTo[idx])
    return out

# ############################################################################################### # PROGRAM # ###############################################################################################

def ResetAllVar ():
    """Reset the per-page parser state flags.

    BUG FIX: the original assigned plain locals, so the module-level
    variables were never actually reset; the 'global' declaration makes
    the reset effective.  OutPutWiki, CreditsString, canWork and
    PassCount are not reset here -- presumably they accumulate across
    the pages of one article (TODO confirm).
    """
    global SlClassCSS, CodeMode, IsScript, IsNote, IsCmd, OkTrim
    SlClassCSS = []
    CodeMode = False
    IsScript = False
    IsNote   = False
    IsCmd    = False
    OkTrim   = False

def Parse_Params ():

  parser = argparse.ArgumentParser(usage = "%(prog)s [options]", version = "%(prog)s " + KVersion,
                                   formatter_class=argparse.RawTextHelpFormatter,	
								 description="Authors: " + base64.b64decode(KAuthor + '==') + " & " + base64.b64decode(KCoAuthor) + "\nConvert HowToForge.com Artivles in Dokuwiki Articles",
                                 epilog="Example: \n   %(prog)s -u http://www.howtoforge.com/perfect-server-ubuntu-12.04-lts-apache2-bind-dovecot-ispconfig-3 -p7  \n   %(prog)s -u http://www.howtoforge.com/perfect-server-ubuntu-12.04-lts-apache2-bind-dovecot-ispconfig-3 -f1 -t7")
  parser.add_argument("-p", "--pagecount", dest="pagecount", default=1,
                    help="Pages to download for this article [default: %(default)s]")
  parser.add_argument("-f", "--frompage", dest="frompage", 
                    help="Begin Page to download for this article (For Partial Download)")
  parser.add_argument("-t", "--topage", dest="topage", 
                    help="End Page to download for this article (For Partial Download)")
  parser.add_argument("-u", "--url", dest="url", metavar="URL", 
                    help="Url to convert")
  parser.add_argument("-w", "--wikiNS", dest="wikiNS", 
                    help="Wiki NameSpace")
  parser.add_argument("-m", "--mediaNS", dest="mediaNS", 
                    help="Media NameSpace")
  parser.add_argument("-c", "--creditstable", dest="creditstable", action="store_true", default=True,
                      help="Include Credits & Info Table [default: %(default)s]")
  parser.add_argument("-o", "--outfolder", dest="outfolder", metavar="FOLDER",
                      help="Folder to save results")
  parser.add_argument("-n", "--flatnames", dest="useflatnames", 
                    help="Begin Page to download for this article (For Partial Download)")
  options = parser.parse_args()
  if (options.url == None):
    print "\nWarning: How the Hell I can convert something if you don't specify URL !!!!";
  
  if (options.outfolder == None) or (options.url == None):
    parser.print_help()
    exit()
  
  if (options.outfolder == None):
    print "\nWarning: OutPut Folder not specified"; exit();

  return options

def GetTitle(Html):
    """Extract the article title from the HTML <title> element.

    The text between <title> and </title> is taken, then everything from
    one character before the '|' separator onward is dropped (HowToForge
    appends ' | <site name>' to the title).
    NOTE: when no '|' is present, find() returns -1 and the slice trims
    the last two characters -- same quirk as the original.
    """
    begin = re.search(KSearchBeginTitle, Html).start(1)
    end = re.search(KSearchEndTitle, Html).start(1)
    raw = Html[begin + 7:end]          # 7 == len('<title>')
    return raw[:raw.find('|') - 1]
  

def CleanHTML(Html, PassNo):
  """Cut the raw page down to the article body, wrapped in <html><body> tags.

  PassNo == 1 (first page): the page title is kept and re-inserted as an
  <h1>; the body starts at the second <p> after the begin-content marker.
  PassNo > 1 (follow-up pages): the body starts after the third </div>.
  Everything from the KEndData3 marker onward is dropped.
  """

  result = ""
  if PassNo == 1:
     Title = GetTitle(Html)
  else:
     Title = ""
  # Locate the begin-of-content marker, bounded by non-letters.
  match = re.search(r"[^a-zA-Z](" + KBeginData + ")[^a-zA-Z]", Html)
  temp = Html[match.start(1):]
  if PassNo == 1:
     # replace(...,1) masks the first '<p>' so find() returns the second
     # occurrence ('XXX' has the same length, so indexes are preserved).
     temp = temp[temp.replace('<p>', 'XXX', 1).find('<p>'):]
  else:
     # Same trick: mask the first two '</div>' so find() returns the third;
     # the +5 offset then skips into/past that tag.
     temp = temp[temp.replace('</div>', 'XXX', 2).find('</div>')+5:]
  if PassNo == 1:
     result = "<html><body><h1>" + Title +"</h1><div class=\"autoinserted\">" +  temp
  else:
     result = "<html><body><div class=\"autoinserted\">" +  temp
  # Truncate at the end-of-content marker and close the synthetic wrapper.
  match = re.search(r"(" + KEndData3 + ")", result)
  result = result[:match.start(1)] + "</body></html>"
  return result

def ExtractFileNameFromUrl(aUrl):
    """Return the last path component of aUrl (its file name)."""
    return aUrl.rsplit('/', 1)[-1]

def GetFlatUrlName(aUrl):
    """Flatten 'http://host/path/name' into 'host_path_name'.

    BUG FIX: in the original the loop body was not indented under the
    'for', which is a syntax error; the accumulation now runs once per
    intermediate token as intended.
    """
    result = ''
    UrlTokens = aUrl.split('/')
    del UrlTokens[0]   # scheme ('http:')
    del UrlTokens[0]   # empty token between the '//'
    FileName = UrlTokens.pop()
    for Token in UrlTokens:
        result = result + Token.replace('.', '_') + '_'
    result = result + FileName
    return result

def GetPageOutputDir(aWikiNameSpace, aOutPutFolder):
    """Output directory for the wiki page of the given namespace.

    The namespace is mapped to a path, sanitized (invalid characters
    become '_'), and appended to the output folder with a trailing
    separator.
    """
    ns_path = "" if aWikiNameSpace is None else NameSpaceToPath(aWikiNameSpace)
    ns_path = SostituisciAllChar(ns_path, KInvalidUrlChar, KValidUrlChar)
    return IncludeTrailingPathDelimiter(IncludeTrailingPathDelimiter(aOutPutFolder) + ns_path)

def GetImageOutputDir(aWikiMediaNameSpace, aWikiNameSpace, aOutPutFolder):
    """Directory where the article's downloaded images are stored.

    The media namespace (if any) is nested inside the page output
    directory.
    """
    media_part = "" if aWikiMediaNameSpace is None else NameSpaceToPath(aWikiMediaNameSpace)
    page_dir = GetPageOutputDir(aWikiNameSpace, aOutPutFolder)
    return IncludeTrailingPathDelimiter(IncludeTrailingPathDelimiter(page_dir) + media_part)

def GetImageName(aImgUrl, aWikiMediaNameSpace, aWikiNameSpace, aOutPutFolder, aUseFlatUrls):
    """Local file name an image URL will be saved under.

    With aUseFlatUrls the whole URL is flattened into the name,
    otherwise only the URL's file name component is used.
    """
    base = GetImageOutputDir(aWikiMediaNameSpace, aWikiNameSpace, aOutPutFolder)
    if aUseFlatUrls:
        return base + GetFlatUrlName(aImgUrl)
    return base + ExtractFileNameFromUrl(aImgUrl)

def combOutput(aFile):

  fd = open(aFile)
  contents = fd.readlines()
  fd.close()
  new_contents = []
  for line in contents:
      # Strip whitespace, should leave nothing if empty line was just "\n"
      if not line.strip():
         continue
      # We got something, save it
      else:
         new_contents.append(line)
  print "Comb File: " + aFile
  fo = file(aFile, 'w')
  for x in "".join(new_contents):
    fo.write(x)
  fo.close()

def SaveToFile(WikiText, aWikiNameSpace, aOutPutFolder, aUrl):

  MyDir = GetPageOutputDir(aWikiNameSpace, aOutPutFolder)
  ForceDirectories(MyDir)
  OutFileName = MyDir + GetFlatUrlName(aUrl).replace('.', '_') + '.txt'
  print "Saving File: " + OutFileName
  fo = file(OutFileName, 'w')
  for x in WikiText:
    fo.write(x)
  fo.close()
  combOutput(OutFileName)

def DownloadImage(Url, SaveName):

  downloaddir = os.path.dirname(SaveName)
  ForceDirectories(downloaddir)
  result = True
  webFile = urllib2.urlopen(Url)
  data = webFile.read()
  try:
     fileName = response.info()['content-disposition'].split('filename="')[1].split('"')[0]
  except:
     fileName = ExtractFileNameFromUrl(Url)
  localFile = os.path.join(downloaddir, fileName.replace('%20','_'))
  print "Download Image: ", localFile
  with open(localFile, "wb") as image:
       image.write(data)
       image.close()
  webFile.close()
  return result

def CustomAdjustLine(aLine):
    """Apply the hand-tuned textual fixups to a chunk of wiki output."""
    return (aLine.replace("<vhost>", "[vhost]")
                 .replace("  //", "//")
                 .replace('<--', '<-')
                 .replace('  \\', '\\'))

def CustomAdjustments(aData):

  print "Custom Adjustments begin"
  result = CustomAdjustLine(''.join(aData))
  print "Custom Adjustments end"
  return result

class MyCreditsParser(HTMLParser):
  """Scan a page for the 'Submitted by ... on <date>' credits block.

  Accumulates a DokuWiki table fragment in the module-level
  CreditsString: one 'Autore' row linking the author profile and one
  'Ultima Modifica' row with the submission date.  canWork gates
  processing to the <div class="info"> region; PassCount makes sure the
  author link is closed only once.
  """

  def handle_starttag(self, tag, attrs):
      global CreditsString
      global canWork
      # Credits live inside <div class="info">; ignore everything else.
      if SameText(tag, 'div') and SameText(valueOf(attrs, 'class'), 'info'): 
         canWork = True
      # The author anchor carries title="View user profile.".
      if SameText(tag, 'a') and SameText(valueOf(attrs, 'title'), 'View user profile.'): 
         CreditsString = CreditsString + '^ Autore | [[' + KBaseUrl + valueOf(attrs, 'href') + '|'
  def handle_endtag(self, tag):
      global CreditsString
      global canWork
      global PassCount
      if SameText(tag, 'div'):
         canWork = False
      if SameText(tag, 'a'): 
         # Close the author link only for the first anchor inside the block.
         if (canWork) and (PassCount <1):
            CreditsString = CreditsString + ']] | \n'
            PassCount = PassCount + 1
  def handle_data(self, data):
      global CreditsString
      global canWork
      if (canWork):
         # Skip the boilerplate tokens surrounding the author name.
         if (data.strip() != 'Submitted by') and (data.strip() != '(') and (data.strip() != ')') and (data.strip() != 'Forums') and (data.strip() !=') (') and (data.strip() !='Contact Author'):
            match = re.search("(on (Mon|Tue|Wed|Thu|Fri|Sat|Sun), \d{4}-\d{2}-\d{2})", data)
            # NOTE(review): in Python 2 'match.groups() > 0' compares a tuple
            # with an int and is always True; the truthiness of 'match' is
            # what actually gates this branch.
            if match and (match.groups() > 0):
               # Keep just the 'YYYY-MM-DD'-ish part after 'on ' (the +3
               # skips 'on ') and drop the comma.
               CreditsString = CreditsString + '^ Ultima Modifica | ' + data[match.start(0)+3:match.end(0)].replace(',', '') + ' | \n'
               canWork = False
            else:
               CreditsString = CreditsString + data.strip()

class MyHTMLParser(HTMLParser):
  """Translate the cleaned HowToForge HTML into DokuWiki markup.

  Works on module-level state: markup fragments are appended to
  OutPutWiki, and every open tag is pushed (with its CSS class) onto the
  SlClassCSS stack so that an end tag can be matched with the style its
  start tag opened.  CodeMode/OkTrim control whitespace trimming;
  IsScript suppresses script content; IsNote changes how <h4> renders.
  """

  def handle_starttag(self, tag, attrs):
     global SlClassCSS
     global CodeMode
     global IsScript
     global OutPutWiki
     global IsNote
     global OkTrim
     # <p class="command"> opens a code block; its text is trimmed.
     if SameText(tag, 'p') and SameText(valueOf(attrs, 'class'), 'command'): 
          OutPutWiki.append('\n' + KBeginCode + '\n')
          CodeMode = True
          OkTrim = True
     elif SameText(tag, 'p') and SameText(valueOf(attrs, 'class'), 'system'):
          OutPutWiki.append(KParagraph + KItalic)
          OkTrim = True
     elif SameText(tag, 'p') and SameText(valueOf(attrs, 'class'), 'highlight'):
          OutPutWiki.append(KParagraph + KBold + KItalic + KCodeStr)
          OkTrim = True
     elif SameText(tag, 'p') and (valueOf(attrs, 'class') == ""):
          OutPutWiki.append('\n' + KParagraph)
     # The 'sponsor' div becomes a note box.
     elif SameText(tag, 'div') and SameText(valueOf(attrs, 'class'), 'sponsor'):
          OutPutWiki.append('\n' + KBeginNote + '\n')
          IsNote = True
     elif SameText(tag, 'h1'):
          OutPutWiki.append('\n' + KBeginH1)
     elif SameText(tag, 'h2'):
          OutPutWiki.append('\n' + KBeginH2)
     elif SameText(tag, 'h3'):
          OutPutWiki.append('\n' + KBeginH3)
     elif SameText(tag, 'h4'):
          # Inside a note box an h4 is rendered as bold text, not a heading.
          if IsNote:
             OutPutWiki.append('\n' + KBold)
          else:
             OutPutWiki.append('\n' + KBeginH4)
     elif SameText(tag, 'h5'):
          OutPutWiki.append('\n' + KBeginH5)
     elif SameText(tag, 'span') and SameText(valueOf(attrs, 'class'), 'system'):
          OutPutWiki.append(KItalic + KCodeStr)
     elif SameText(tag, 'span') and SameText(valueOf(attrs, 'class'), 'highlight'):
          OutPutWiki.append(KBold + KItalic + KCodeStr)
     elif SameText(tag, 'li'):
          OutPutWiki.append('\n' + KSymbolList)
     elif SameText(tag, 'br'):
          # Inside code a <br> is a bare newline; elsewhere a paragraph break.
          if CodeMode:
             OutPutWiki.append('\n')
          else:
             OutPutWiki.append(KParagraph + '\n')
     elif SameText(tag, 'b') or SameText(tag, 'strong'):
         OutPutWiki.append(KBold)
     elif SameText(tag, 'i') or SameText(tag, 'em'):
         OutPutWiki.append(KItalic)
     elif SameText(tag, 'pre'):
         OutPutWiki.append('\n' + KBeginFile + '\n')
         CodeMode = True
     elif SameText(tag, 'script'):
         IsScript = True
     elif SameText(tag, 'noscript'):
         IsScript = True
     # 'thickbox' anchors are image-zoom wrappers and are dropped.
     elif SameText(tag, 'a') and not SameText(valueOf(attrs, 'class'), 'thickbox'):
         OutPutWiki.append(KBeginLink + valueOf(attrs, 'href') + '|')
     elif SameText(tag, 'img'):
         # 'Click to enlarge' thumbnails are skipped; other images are
         # downloaded and replaced by a DokuWiki media tag (or a failure
         # marker when the download does not succeed).
         if not SameText(valueOf(attrs, 'alt'), 'Click to enlarge'):
            if DownloadImage(valueOf(attrs, 'src'),  GetImageName(valueOf(attrs, 'src'), MyOptions.mediaNS, MyOptions.wikiNS, MyOptions.outfolder, MyOptions.useflatnames)):
               MyNS = MyOptions.mediaNS
               if MyNS == None:
                  MyNS = ""
               else:
                  MyNS = ":" + MyNS + ":"
               if MyOptions.useflatnames:
                  ImgWikiTag = '\n' +  KBeginImage + MyNS + GetFlatUrlName(valueOf(attrs, 'src')) + '|' + KEndImage + '\n'
                  OutPutWiki.append(ImgWikiTag)
               else:
                  ImgWikiTag = '\n' +  KBeginImage + MyNS + ExtractFileNameFromUrl(valueOf(attrs, 'src')) + '|' + KEndImage + '\n'
                  OutPutWiki.append(ImgWikiTag)
            else:
               OutPutWiki.append('\n' +  ' #IMG:' + valueOf(attrs, 'src') + ' (Failed to Download)' + '\n')
     # Record the tag (with its class) so handle_endtag can match styles;
     # void tags <br>/<img> never get an end tag and are not pushed.
     if (not SameText(tag, 'br')) and (not SameText(tag, 'img')):
        SlClassCSS.insert(0, (tag, valueOf(attrs, 'class')));
  def handle_endtag(self, tag):
     global SlClassCSS
     global CodeMode
     global IsScript
     global OutPutWiki
     global IsNote
     global OkTrim
     # SlClassCSS[0] holds the (tag, class) recorded by the matching start tag.
     if SameText(tag, 'p') and SameText(SlClassCSS[0][1], 'command'): 
          OutPutWiki.append('\n' + KEndCode + '\n')
          CodeMode = False
          OkTrim = False
     elif SameText(tag, 'p') and SameText(SlClassCSS[0][1], 'highlight'):
          # Close markers in reverse order of the start tag.
          OutPutWiki.append(KCodeStr + KItalic + KBold + KParagraph)
          OkTrim = False
     elif SameText(tag, 'p') and SameText(SlClassCSS[0][1], 'system'):
          OutPutWiki.append(KItalic + KParagraph)
          OkTrim = False
     elif SameText(tag, 'p'):
          OutPutWiki.append(KParagraph + '\n')
     elif SameText(tag, 'div') and SameText(SlClassCSS[0][1], 'sponsor'):
          OutPutWiki.append('\n' + KEndNote + '\n')
          IsNote = False
     elif SameText(tag, 'h1'):
          OutPutWiki.append(KEndH1 +'\n')
     elif SameText(tag, 'h2'):
          OutPutWiki.append(KEndH2 +'\n')
     elif SameText(tag, 'h3'):
          OutPutWiki.append(KEndH3 +'\n')
     elif SameText(tag, 'h4'):
          # Counterpart of the start-tag special case for note boxes.
          if IsNote:
             OutPutWiki.append(KBold + '\n' + '----' + '\n')
          else:
             OutPutWiki.append(KEndH4 +'\n')
     elif SameText(tag, 'h5'):
          OutPutWiki.append(KEndH5 +'\n')
     elif SameText(tag, 'span') and SameText(SlClassCSS[0][1], 'system'):
          OutPutWiki.append(KCodeStr + KItalic)
     elif SameText(tag, 'span') and SameText(SlClassCSS[0][1], 'highlight'):
          OutPutWiki.append(KCodeStr + KItalic + KBold)
     elif SameText(tag, 'li'):
          OutPutWiki.append('\n')
     elif SameText(tag, 'b') or SameText(tag, 'strong'):
          OutPutWiki.append(KBold + ' ')
     elif SameText(tag, 'i') or SameText(tag, 'em'):
          OutPutWiki.append(KItalic)
     elif SameText(tag, 'pre'):
          OutPutWiki.append('\n' + KEndFile + '\n')
          CodeMode = False
     elif SameText(tag, 'script'):
          IsScript = False
     elif SameText(tag, 'noscript'):
          IsScript = False
     elif SameText(tag, 'a') and not SameText(SlClassCSS[0][1], 'thickbox'):
          OutPutWiki.append(KEndLink + ' ')
     # Pop the matching entry from the open-tag stack.
     if (not SameText(tag, 'br')) and (not SameText(tag, 'img')):
         if SameText(tag, SlClassCSS[0][0]):
            del SlClassCSS[0]
  def handle_entityref(self, name):
      # &gt;/&lt; are emitted literally; every other named entity is decoded
      # to its character (UTF-8 encoded, Python 2 unichr).
      if name == 'gt':
         OutPutWiki.append('>')
      elif name == 'lt':
         OutPutWiki.append('<')
      else:
         OutPutWiki.append(unichr(name2codepoint[name]).encode('utf8'))
  def handle_data(self, data):
      global CodeMode
      global IsScript
      global OkTrim
      # Script content is dropped; code/command text is whitespace-trimmed;
      # everything else passes through unchanged.
      if (not IsScript): 
         if (CodeMode) or (OkTrim): 
            OutPutWiki.append(data.strip())
         else: 
            OutPutWiki.append(data)

def getbaseurl(aUrl):
    """Strip a trailing '-pN' page suffix from an article URL.

    Fix: the original used a bare 'except:' to handle the no-match case
    (re.search returning None raised AttributeError on .start); an
    explicit None check replaces the exception-driven control flow.
    """
    match = re.search(r"(-p[0-9])", aUrl)
    if match:
        return aUrl[:match.start(1)]
    return aUrl

def getutlforpage(aBaseUrl, aPageNo):
    """URL of page aPageNo of the article (page 1 carries no '-pN' suffix)."""
    base = getbaseurl(aBaseUrl)
    if aPageNo == 1:
        return base
    return base + "-p" + str(aPageNo)

def getCredits(aCreditsTable):
    """Build the DokuWiki 'Revisioni' section: credits table plus the
    list of converted pages (from the module-level PagesList)."""
    pieces = [KBeginH2 + KCreditsHeader + KEndH2 + "\n\n"]
    pieces.append("^  " + KCreditsHeader +"  ^^ \n" + aCreditsTable + "^  Pagine  ^^ \n")
    for Page in PagesList:
        pieces.append("| [[" + Page + "]] || \n")
    return ''.join(pieces)

def ConvertPage(aUrl, aPassNo):
  """Download one article page, harvest credits, and feed it to the parser.

  aPassNo is 1 for the first page (title kept) and >1 for follow-up
  pages.  Output accumulates in the module-level OutPutWiki list; the
  credits table is rebuilt into CreditsInfo.
  """

  global CreditsInfo
  print "Converting page: ", aUrl
# Track Pages
  PagesList.append(aUrl)
  # Reset Vars
  ResetAllVar()
  # Download HTML
  usock = urllib2.urlopen(aUrl)
  data = usock.read()
  usock.close()
# Before all the credits data is stripped from the html, extract it ...
  if CreditsInfo == []:
     CreditsParser = MyCreditsParser()
     CreditsParser.feed(data)
# Build the Revisions / Credits table
  CreditsInfo = []
  CreditsInfo.append(getCredits(CreditsString))
  # Clean HTML
  MyHtml = CleanHTML(data, aPassNo)
  #SaveToFile(MyHtml, MyOptions.wikiNS, MyOptions.outfolder, aUrl + "_" + str(aPassNo))
  # Parse HTML
  parser = MyHTMLParser()
  parser.feed(MyHtml)

def ConvertSinglePage(aUrl):
    """Convert a one-page article and save the result.

    BUG FIX: CustomAdjustments returns a *string*, so the original's
    OutPutWiki.append(...) calls raised AttributeError whenever the
    credits table was enabled (which is the default).  The credits are
    now concatenated, exactly as ConvertMultiPage already did.
    """
    global OutPutWiki
    ConvertPage(aUrl, 1)
    # "Special" conversions -- turns the fragment list into one string.
    OutPutWiki = CustomAdjustments(OutPutWiki)
    # Append the Credits table at the bottom of the article.
    if MyOptions.creditstable:
        OutPutWiki = OutPutWiki + '\n'
        OutPutWiki = OutPutWiki + ''.join(CreditsInfo)
    # Save to File
    SaveToFile(OutPutWiki, MyOptions.wikiNS, MyOptions.outfolder, aUrl)

def ConvertMultiPage(aUrl, aBeginPage, aEndPage):
    """Convert pages aBeginPage..aEndPage into a single saved article.

    NOTE: like the original, the page URLs are derived from
    MyOptions.url, not from the aUrl parameter.
    """
    global OutPutWiki
    for page_no in range(aBeginPage, aEndPage + 1):
        ConvertPage(getutlforpage(MyOptions.url, page_no), page_no)
    # "Special" conversions -- turns the fragment list into one string.
    OutPutWiki = CustomAdjustments(OutPutWiki)
    # Append the Credits table at the bottom of the article.
    if MyOptions.creditstable:
        OutPutWiki = OutPutWiki + '\n'
        OutPutWiki = OutPutWiki + ''.join(CreditsInfo)
    # Save to File
    SaveToFile(OutPutWiki, MyOptions.wikiNS, MyOptions.outfolder, getutlforpage(MyOptions.url, 1))

if name == “main”:

  MyOptions = Parse_Params()

  if (MyOptions.wikiNS != None):
    MyOptions.wikiNS = MyOptions.wikiNS.replace('.', '_')
  if (MyOptions.mediaNS != None):
    MyOptions.mediaNS = MyOptions.mediaNS.replace('.', '_')
  print os.path.basename(sys.argv[0]) + " By " + base64.b64decode(KAuthor + '==') + " & " + base64.b64decode(KCoAuthor)
  print "Begin Convert"
  if (MyOptions.frompage != None):
     BeginPage = int(MyOptions.frompage)
  if (MyOptions.topage != None):
     EndPage = int(MyOptions.topage)
  if (BeginPage != -1) and (EndPage == -1):
     EndPage = BeginPage
  if (BeginPage == -1) and (EndPage != -1):
     BeginPage = EndPage
   
  if (BeginPage == -1) and (EndPage == -1):
     if int(MyOptions.pagecount) == 0:
        ConvertSinglePage(MyOptions.url)
     else:
        BeginPage = 1
        EndPage   = int(MyOptions.pagecount)
   
  if (BeginPage == EndPage):
     ConvertSinglePage(MyOptions.url)
  else:
     ConvertMultiPage(MyOptions.url, BeginPage, EndPage)
  print "Convert done"

</code>

Per testare il software mi sono fatto un piccolo batch

rem HTF2Wiki.py -u http://www.howtoforge.com/perfect-server-ubuntu-12.10-apache2-bind-dovecot-ispconfig-3-p2 -p7 -o .\Test -m linux:perfect_server_ubuntu_12.10
HTF2Wiki.py -u http://www.howtoforge.com/perfect-server-ubuntu-12.10-apache2-bind-dovecot-ispconfig-3-p2 -f1 -t7 -o .\Test -m linux:perfect_server_ubuntu_12.10

Le 2 righe sono equivalenti: partono dalla prima pagina dell'articolo che si trova a quell'indirizzo e scaricano tutte e 7 le pagine dell'articolo in un unico file .txt, scaricando anche le immagini.

Allego anche il sorgente

Download Con ProgressBar

Un esempio di download con la progressbar

# Download a file while showing a simple in-place text progress indicator
# (Python 2, urllib2).
import urllib2
 
# Publicly available file used purely as a download test fixture.
url = "http://download.thinkbroadband.com/10MB.zip"
 
file_name = url.split('/')[-1]
u = urllib2.urlopen(url)
f = open(file_name, 'wb')
meta = u.info()
# Total size as announced by the Content-Length response header.
file_size = int(meta.getheaders("Content-Length")[0])
print "Downloading: %s Bytes: %s" % (file_name, file_size)
 
file_size_dl = 0
block_sz = 8192
# Read and write the payload in 8 KiB chunks until EOF.
while True:
    buffer = u.read(block_sz)
    if not buffer:
        break
 
    file_size_dl += len(buffer)
    f.write(buffer)
    # Progress line: bytes downloaded so far and percentage of the total.
    status = r"%10d  [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
    # Append backspace characters (chr(8)) so the next print overwrites
    # this line in place; the trailing comma suppresses the newline.
    status = status + chr(8)*(len(status)+1)
    print status,
 
f.close()