Indice

Esempi di Python

HowToForge To Wiki Converter

In questo esempio si vede come lavora il parser HTML integrato in Python: questo programmino serve per alimentare la presente Wiki con documentazioni varie provenienti da HowToForge. Nell'esempio sono state usate solo librerie integrate in Python e la versione usata è la 2.7.x.

#!/usr/bin/python
 
# ###############################################################################################
# HowToForge to Wiki converter
# ###############################################################################################
# Notes : Originaly developed in Embarcadero Delphi 2007 ... Ported to python for cross-platform
#         compatibility
# Date    : 20121128
# Author  : Ascanio Pressato
# CoAuthor: Massimo Fioravanti
# Lic.    : Creative Commons
# ###############################################################################################
 
import urllib2
import sys
import re
import os
import argparse 
import textwrap
import base64
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
 
 
# Const
# DokuWiki markup constants used when emitting the converted article.
# Headings: DokuWiki counts '=' signs downwards, so H1 uses the most.
KBeginH1 = '===== '
KEndH1 = ' ====='
KBeginH2 = '==== '
KEndH2 = ' ===='
KBeginH3 = '=== '
KEndH3 = ' ==='
KBeginH4 = '== '
KEndH4 = ' =='
KBeginH5 = '= '
KEndH5 = ' ='

# Inline text formatting markers.
KBold = '**'
KItalic = '//'
KUnderline = '__'
KBeginStrike = '<del>'
KEndStrike = '</del>'
KCodeStr = "''"

# Link delimiters: [[url|text]]
KBeginLink = '[['
KEndLink = ']]'

# List item prefixes (ordered / unordered).
KNumList = '  - '
KSymbolList = '  * '

# Horizontal rule.
KLine = '----'

# Keyboard-key markup (plugin syntax).
KBeginKey = '<key>'
KEndKey = '</key>'

# Boxed text (plugin syntax).
KBeginBox = '<box>'
KEndBox = '</box>'

# Verbatim file block.
KBeginFile = '<file>'
KEndFile = '</file>'
 
KBeginCode = '<code>'
# NOTE(review): the values below were destroyed when this listing was pasted
# into the wiki (the renderer interpreted them as live markup).  They are
# reconstructed from how the rest of the program uses them; confirm against
# the original source where marked with TODO.
KEndCode = '</code>'

# DokuWiki note-plugin markers (used for the 'sponsor' boxes).
KBeginNote = '<note>'
KEndNote = '</note>'

# DokuWiki media/image embed markers: {{ns:file.png|caption}}
KBeginImage = '{{'
KEndImage = '}}'

# DokuWiki forced line break -- TODO confirm (could also have been '\n').
KParagraph = r'\\ '

# Regex patterns locating the HTML <title> element.
KSearchBeginTitle = r"(<title>)"
KSearchEndTitle = r"(<\/title>)"

# Characters not allowed in page/path names, each mapped 1:1 to '_'.
KInvalidUrlChar = ' :.,\/+*'
KValidUrlChar = ''.ljust(len(KInvalidUrlChar), '_')
# Markers delimiting the article body inside the HowToForge page.
# TODO(review): the exact KEndData / KEndData3 values were lost in the wiki
# paste; KEndData3 is the one CleanHTML() actually searches for.
KBeginData = "<!-- begin content -->"
KEndData = "<!-- end content -->"
KEndData2 = "<!-- AddThis Button END -->"
KEndData3 = "<!-- AddThis Button END -->"

# Mutable converter state (the per-page flags are reset by ResetAllVar).
OutPutWiki = []        # accumulated wiki output fragments
SlClassCSS = []        # stack of (tag, css-class) for currently open tags
CodeMode = False       # True while inside a code/file block
IsScript = False       # True while inside <script>/<noscript> (content dropped)
IsNote = False         # True while inside the 'sponsor' note box
IsCmd = False          # NOTE(review): never set by the visible code
OkTrim = False         # True => whitespace-trim text nodes

KBaseUrl = 'http://www.howtoforge.com/'
CreditsString = ""     # filled in by MyCreditsParser
CreditsInfo = []       # rendered credits table (one joined string)
canWork = False        # MyCreditsParser: inside <div class="info">
PassCount = 0          # MyCreditsParser: author link closed only once
PagesList = []         # every page URL converted (listed in the credits table)
KCreditsHeader = 'Revisioni'

# Page range; -1 means "not specified on the command line".
BeginPage = -1
EndPage = -1

KVersion = "2.9 Python"
# Author names, base64-encoded.
KAuthor = "QXNjYW5pbyBQcmVzc2F0bw"
KCoAuthor = "TWFzc2ltbyBGaW9yYXZhbnRp"

# Transcode Table (HTML -> DokuWiki):
#   <p class="command"></p>   => code block
#   <p class="system"></p>    => italic
#   <p></p>                   => paragraph
#   <span class="system">     => italic; <span class="highlight"> => bold+italic
#   <pre></pre>               => file block
#   <br>                      => paragraph break
#   <a href="">text</a>       => [[href|text]]
#   <img>                     => downloaded + {{...}} media tag
#
# Sample of the line the credits extractor works on:
#   Submitted by <a href="forums/member.php?u=2" title="View user profile."
#   rel="nofollow">falko</a> (...) on Sun, 2012-10-21 17:33. :: <a ...>ISPConfig</a> | ...

# ############################################################################################### # VCL # ###############################################################################################

def valueOf(List, Name):
    """Return the value of the last attribute called *Name* in List.

    List is a sequence of (name, value) pairs (HTMLParser attrs);
    an empty string is returned when no pair matches.
    """
    hits = [value for (name, value) in List if name == Name]
    return hits[-1] if hits else ""

def SameText(str1, str2):
    """Case-insensitive string comparison (Delphi SameText equivalent)."""
    return str1.lower() == str2.lower()

def IncludeTrailingPathDelimiter(aPath):
    """Return aPath ending with a path separator (Delphi-style helper).

    Joining with an empty component appends exactly one separator
    when it is missing.
    """
    return os.path.join(aPath, '')

def NameSpaceToPath(aNameSpace):
    """Map a DokuWiki namespace ('a:b.c') to a relative path ('a/b_c').

    Dots inside each namespace component are replaced by underscores.
    """
    path = ""
    for component in (part.replace(".", "_") for part in aNameSpace.split(':')):
        path = os.path.join(path, component)
    return path

def ForceDirectories(aPath):
    """Create aPath (with any missing parents) if it does not exist yet.

    Fix over the original check-then-create: os.path.exists + makedirs
    could race (directory created in between -> OSError) and silently
    accepted a plain *file* at aPath.  Here we attempt the creation and
    only swallow the error when aPath really is a directory afterwards.
    """
    try:
        os.makedirs(aPath)
    except OSError:
        if not os.path.isdir(aPath):
            raise

# ############################################################################################### # GENUTIL # ###############################################################################################

def SostituisciAllChar(St, Replaced, ReplacedTo):
    """Replace every occurrence of Replaced[i] in St with ReplacedTo[i].

    Replaced and ReplacedTo are parallel character strings: character i
    of the former maps to character i of the latter.
    """
    out = St
    for idx, ch in enumerate(Replaced):
        out = out.replace(ch, ReplacedTo[idx])
    return out

# ############################################################################################### # PROGRAM # ###############################################################################################

def ResetAllVar ():
    """Reset the per-page parser state flags.

    BUG FIX: the original assigned plain locals, so the module-level
    variables were never actually reset; the 'global' declaration makes
    the reset effective.  OutPutWiki, CreditsString, canWork and
    PassCount are not reset here -- presumably they accumulate across
    the pages of one article (TODO confirm).
    """
    global SlClassCSS, CodeMode, IsScript, IsNote, IsCmd, OkTrim
    SlClassCSS = []
    CodeMode = False
    IsScript = False
    IsNote   = False
    IsCmd    = False
    OkTrim   = False

def Parse_Params ():

  parser = argparse.ArgumentParser(usage = "%(prog)s [options]", version = "%(prog)s " + KVersion,
                                   formatter_class=argparse.RawTextHelpFormatter,	
								 description="Authors: " + base64.b64decode(KAuthor + '==') + " & " + base64.b64decode(KCoAuthor) + "\nConvert HowToForge.com Artivles in Dokuwiki Articles",
                                 epilog="Example: \n   %(prog)s -u http://www.howtoforge.com/perfect-server-ubuntu-12.04-lts-apache2-bind-dovecot-ispconfig-3 -p7  \n   %(prog)s -u http://www.howtoforge.com/perfect-server-ubuntu-12.04-lts-apache2-bind-dovecot-ispconfig-3 -f1 -t7")
  parser.add_argument("-p", "--pagecount", dest="pagecount", default=1,
                    help="Pages to download for this article [default: %(default)s]")
  parser.add_argument("-f", "--frompage", dest="frompage", 
                    help="Begin Page to download for this article (For Partial Download)")
  parser.add_argument("-t", "--topage", dest="topage", 
                    help="End Page to download for this article (For Partial Download)")
  parser.add_argument("-u", "--url", dest="url", metavar="URL", 
                    help="Url to convert")
  parser.add_argument("-w", "--wikiNS", dest="wikiNS", 
                    help="Wiki NameSpace")
  parser.add_argument("-m", "--mediaNS", dest="mediaNS", 
                    help="Media NameSpace")
  parser.add_argument("-c", "--creditstable", dest="creditstable", action="store_true", default=True,
                      help="Include Credits & Info Table [default: %(default)s]")
  parser.add_argument("-o", "--outfolder", dest="outfolder", metavar="FOLDER",
                      help="Folder to save results")
  parser.add_argument("-n", "--flatnames", dest="useflatnames", 
                    help="Begin Page to download for this article (For Partial Download)")
  options = parser.parse_args()
  if (options.url == None):
    print "\nWarning: How the Hell I can convert something if you don't specify URL !!!!";
  
  if (options.outfolder == None) or (options.url == None):
    parser.print_help()
    exit()
  
  if (options.outfolder == None):
    print "\nWarning: OutPut Folder not specified"; exit();

  return options

def GetTitle(Html):
    """Extract the article title from the HTML <title> element.

    The text between <title> and </title> is taken, then everything from
    one character before the '|' separator onward is dropped (HowToForge
    appends ' | <site name>' to the title).
    NOTE: when no '|' is present, find() returns -1 and the slice trims
    the last two characters -- same quirk as the original.
    """
    begin = re.search(KSearchBeginTitle, Html).start(1)
    end = re.search(KSearchEndTitle, Html).start(1)
    raw = Html[begin + 7:end]          # 7 == len('<title>')
    return raw[:raw.find('|') - 1]
  

def CleanHTML(Html, PassNo):
  """Cut the raw page down to the article body, wrapped in <html><body> tags.

  PassNo == 1 (first page): the page title is kept and re-inserted as an
  <h1>; the body starts at the second <p> after the begin-content marker.
  PassNo > 1 (follow-up pages): the body starts after the third </div>.
  Everything from the KEndData3 marker onward is dropped.
  """

  result = ""
  if PassNo == 1:
     Title = GetTitle(Html)
  else:
     Title = ""
  # Locate the begin-of-content marker, bounded by non-letters.
  match = re.search(r"[^a-zA-Z](" + KBeginData + ")[^a-zA-Z]", Html)
  temp = Html[match.start(1):]
  if PassNo == 1:
     # replace(...,1) masks the first '<p>' so find() returns the second
     # occurrence ('XXX' has the same length, so indexes are preserved).
     temp = temp[temp.replace('<p>', 'XXX', 1).find('<p>'):]
  else:
     # Same trick: mask the first two '</div>' so find() returns the third;
     # the +5 offset then skips into/past that tag.
     temp = temp[temp.replace('</div>', 'XXX', 2).find('</div>')+5:]
  if PassNo == 1:
     result = "<html><body><h1>" + Title +"</h1><div class=\"autoinserted\">" +  temp
  else:
     result = "<html><body><div class=\"autoinserted\">" +  temp
  # Truncate at the end-of-content marker and close the synthetic wrapper.
  match = re.search(r"(" + KEndData3 + ")", result)
  result = result[:match.start(1)] + "</body></html>"
  return result

def ExtractFileNameFromUrl(aUrl):
    """Return the last path component of aUrl (its file name)."""
    return aUrl.rsplit('/', 1)[-1]

def GetFlatUrlName(aUrl):
    """Flatten 'http://host/path/name' into 'host_path_name'.

    BUG FIX: in the original the loop body was not indented under the
    'for', which is a syntax error; the accumulation now runs once per
    intermediate token as intended.
    """
    result = ''
    UrlTokens = aUrl.split('/')
    del UrlTokens[0]   # scheme ('http:')
    del UrlTokens[0]   # empty token between the '//'
    FileName = UrlTokens.pop()
    for Token in UrlTokens:
        result = result + Token.replace('.', '_') + '_'
    result = result + FileName
    return result

def GetPageOutputDir(aWikiNameSpace, aOutPutFolder):
    """Output directory for the wiki page of the given namespace.

    The namespace is mapped to a path, sanitized (invalid characters
    become '_'), and appended to the output folder with a trailing
    separator.
    """
    ns_path = "" if aWikiNameSpace is None else NameSpaceToPath(aWikiNameSpace)
    ns_path = SostituisciAllChar(ns_path, KInvalidUrlChar, KValidUrlChar)
    return IncludeTrailingPathDelimiter(IncludeTrailingPathDelimiter(aOutPutFolder) + ns_path)

def GetImageOutputDir(aWikiMediaNameSpace, aWikiNameSpace, aOutPutFolder):
    """Directory where the article's downloaded images are stored.

    The media namespace (if any) is nested inside the page output
    directory.
    """
    media_part = "" if aWikiMediaNameSpace is None else NameSpaceToPath(aWikiMediaNameSpace)
    page_dir = GetPageOutputDir(aWikiNameSpace, aOutPutFolder)
    return IncludeTrailingPathDelimiter(IncludeTrailingPathDelimiter(page_dir) + media_part)

def GetImageName(aImgUrl, aWikiMediaNameSpace, aWikiNameSpace, aOutPutFolder, aUseFlatUrls):
    """Local file name an image URL will be saved under.

    With aUseFlatUrls the whole URL is flattened into the name,
    otherwise only the URL's file name component is used.
    """
    base = GetImageOutputDir(aWikiMediaNameSpace, aWikiNameSpace, aOutPutFolder)
    if aUseFlatUrls:
        return base + GetFlatUrlName(aImgUrl)
    return base + ExtractFileNameFromUrl(aImgUrl)

def combOutput(aFile):

  fd = open(aFile)
  contents = fd.readlines()
  fd.close()
  new_contents = []
  for line in contents:
      # Strip whitespace, should leave nothing if empty line was just "\n"
      if not line.strip():
         continue
      # We got something, save it
      else:
         new_contents.append(line)
  print "Comb File: " + aFile
  fo = file(aFile, 'w')
  for x in "".join(new_contents):
    fo.write(x)
  fo.close()

def SaveToFile(WikiText, aWikiNameSpace, aOutPutFolder, aUrl):

  MyDir = GetPageOutputDir(aWikiNameSpace, aOutPutFolder)
  ForceDirectories(MyDir)
  OutFileName = MyDir + GetFlatUrlName(aUrl).replace('.', '_') + '.txt'
  print "Saving File: " + OutFileName
  fo = file(OutFileName, 'w')
  for x in WikiText:
    fo.write(x)
  fo.close()
  combOutput(OutFileName)

def DownloadImage(Url, SaveName):

  downloaddir = os.path.dirname(SaveName)
  ForceDirectories(downloaddir)
  result = True
  webFile = urllib2.urlopen(Url)
  data = webFile.read()
  try:
     fileName = response.info()['content-disposition'].split('filename="')[1].split('"')[0]
  except:
     fileName = ExtractFileNameFromUrl(Url)
  localFile = os.path.join(downloaddir, fileName.replace('%20','_'))
  print "Download Image: ", localFile
  with open(localFile, "wb") as image:
       image.write(data)
       image.close()
  webFile.close()
  return result

def CustomAdjustLine(aLine):
    """Apply the hand-tuned textual fixups to a chunk of wiki output."""
    return (aLine.replace("<vhost>", "[vhost]")
                 .replace("  //", "//")
                 .replace('<--', '<-')
                 .replace('  \\', '\\'))

def CustomAdjustments(aData):

  print "Custom Adjustments begin"
  result = CustomAdjustLine(''.join(aData))
  print "Custom Adjustments end"
  return result

class MyCreditsParser(HTMLParser):
  """Scan a page for the 'Submitted by ... on <date>' credits block.

  Accumulates a DokuWiki table fragment in the module-level
  CreditsString: one 'Autore' row linking the author profile and one
  'Ultima Modifica' row with the submission date.  canWork gates
  processing to the <div class="info"> region; PassCount makes sure the
  author link is closed only once.
  """

  def handle_starttag(self, tag, attrs):
      global CreditsString
      global canWork
      # Credits live inside <div class="info">; ignore everything else.
      if SameText(tag, 'div') and SameText(valueOf(attrs, 'class'), 'info'): 
         canWork = True
      # The author anchor carries title="View user profile.".
      if SameText(tag, 'a') and SameText(valueOf(attrs, 'title'), 'View user profile.'): 
         CreditsString = CreditsString + '^ Autore | [[' + KBaseUrl + valueOf(attrs, 'href') + '|'
  def handle_endtag(self, tag):
      global CreditsString
      global canWork
      global PassCount
      if SameText(tag, 'div'):
         canWork = False
      if SameText(tag, 'a'): 
         # Close the author link only for the first anchor inside the block.
         if (canWork) and (PassCount <1):
            CreditsString = CreditsString + ']] | \n'
            PassCount = PassCount + 1
  def handle_data(self, data):
      global CreditsString
      global canWork
      if (canWork):
         # Skip the boilerplate tokens surrounding the author name.
         if (data.strip() != 'Submitted by') and (data.strip() != '(') and (data.strip() != ')') and (data.strip() != 'Forums') and (data.strip() !=') (') and (data.strip() !='Contact Author'):
            match = re.search("(on (Mon|Tue|Wed|Thu|Fri|Sat|Sun), \d{4}-\d{2}-\d{2})", data)
            # NOTE(review): in Python 2 'match.groups() > 0' compares a tuple
            # with an int and is always True; the truthiness of 'match' is
            # what actually gates this branch.
            if match and (match.groups() > 0):
               # Keep just the 'YYYY-MM-DD'-ish part after 'on ' (the +3
               # skips 'on ') and drop the comma.
               CreditsString = CreditsString + '^ Ultima Modifica | ' + data[match.start(0)+3:match.end(0)].replace(',', '') + ' | \n'
               canWork = False
            else:
               CreditsString = CreditsString + data.strip()

class MyHTMLParser(HTMLParser):
  """Translate the cleaned HowToForge HTML into DokuWiki markup.

  Works on module-level state: markup fragments are appended to
  OutPutWiki, and every open tag is pushed (with its CSS class) onto the
  SlClassCSS stack so that an end tag can be matched with the style its
  start tag opened.  CodeMode/OkTrim control whitespace trimming;
  IsScript suppresses script content; IsNote changes how <h4> renders.
  """

  def handle_starttag(self, tag, attrs):
     global SlClassCSS
     global CodeMode
     global IsScript
     global OutPutWiki
     global IsNote
     global OkTrim
     # <p class="command"> opens a code block; its text is trimmed.
     if SameText(tag, 'p') and SameText(valueOf(attrs, 'class'), 'command'): 
          OutPutWiki.append('\n' + KBeginCode + '\n')
          CodeMode = True
          OkTrim = True
     elif SameText(tag, 'p') and SameText(valueOf(attrs, 'class'), 'system'):
          OutPutWiki.append(KParagraph + KItalic)
          OkTrim = True
     elif SameText(tag, 'p') and SameText(valueOf(attrs, 'class'), 'highlight'):
          OutPutWiki.append(KParagraph + KBold + KItalic + KCodeStr)
          OkTrim = True
     elif SameText(tag, 'p') and (valueOf(attrs, 'class') == ""):
          OutPutWiki.append('\n' + KParagraph)
     # The 'sponsor' div becomes a note box.
     elif SameText(tag, 'div') and SameText(valueOf(attrs, 'class'), 'sponsor'):
          OutPutWiki.append('\n' + KBeginNote + '\n')
          IsNote = True
     elif SameText(tag, 'h1'):
          OutPutWiki.append('\n' + KBeginH1)
     elif SameText(tag, 'h2'):
          OutPutWiki.append('\n' + KBeginH2)
     elif SameText(tag, 'h3'):
          OutPutWiki.append('\n' + KBeginH3)
     elif SameText(tag, 'h4'):
          # Inside a note box an h4 is rendered as bold text, not a heading.
          if IsNote:
             OutPutWiki.append('\n' + KBold)
          else:
             OutPutWiki.append('\n' + KBeginH4)
     elif SameText(tag, 'h5'):
          OutPutWiki.append('\n' + KBeginH5)
     elif SameText(tag, 'span') and SameText(valueOf(attrs, 'class'), 'system'):
          OutPutWiki.append(KItalic + KCodeStr)
     elif SameText(tag, 'span') and SameText(valueOf(attrs, 'class'), 'highlight'):
          OutPutWiki.append(KBold + KItalic + KCodeStr)
     elif SameText(tag, 'li'):
          OutPutWiki.append('\n' + KSymbolList)
     elif SameText(tag, 'br'):
          # Inside code a <br> is a bare newline; elsewhere a paragraph break.
          if CodeMode:
             OutPutWiki.append('\n')
          else:
             OutPutWiki.append(KParagraph + '\n')
     elif SameText(tag, 'b') or SameText(tag, 'strong'):
         OutPutWiki.append(KBold)
     elif SameText(tag, 'i') or SameText(tag, 'em'):
         OutPutWiki.append(KItalic)
     elif SameText(tag, 'pre'):
         OutPutWiki.append('\n' + KBeginFile + '\n')
         CodeMode = True
     elif SameText(tag, 'script'):
         IsScript = True
     elif SameText(tag, 'noscript'):
         IsScript = True
     # 'thickbox' anchors are image-zoom wrappers and are dropped.
     elif SameText(tag, 'a') and not SameText(valueOf(attrs, 'class'), 'thickbox'):
         OutPutWiki.append(KBeginLink + valueOf(attrs, 'href') + '|')
     elif SameText(tag, 'img'):
         # 'Click to enlarge' thumbnails are skipped; other images are
         # downloaded and replaced by a DokuWiki media tag (or a failure
         # marker when the download does not succeed).
         if not SameText(valueOf(attrs, 'alt'), 'Click to enlarge'):
            if DownloadImage(valueOf(attrs, 'src'),  GetImageName(valueOf(attrs, 'src'), MyOptions.mediaNS, MyOptions.wikiNS, MyOptions.outfolder, MyOptions.useflatnames)):
               MyNS = MyOptions.mediaNS
               if MyNS == None:
                  MyNS = ""
               else:
                  MyNS = ":" + MyNS + ":"
               if MyOptions.useflatnames:
                  ImgWikiTag = '\n' +  KBeginImage + MyNS + GetFlatUrlName(valueOf(attrs, 'src')) + '|' + KEndImage + '\n'
                  OutPutWiki.append(ImgWikiTag)
               else:
                  ImgWikiTag = '\n' +  KBeginImage + MyNS + ExtractFileNameFromUrl(valueOf(attrs, 'src')) + '|' + KEndImage + '\n'
                  OutPutWiki.append(ImgWikiTag)
            else:
               OutPutWiki.append('\n' +  ' #IMG:' + valueOf(attrs, 'src') + ' (Failed to Download)' + '\n')
     # Record the tag (with its class) so handle_endtag can match styles;
     # void tags <br>/<img> never get an end tag and are not pushed.
     if (not SameText(tag, 'br')) and (not SameText(tag, 'img')):
        SlClassCSS.insert(0, (tag, valueOf(attrs, 'class')));
  def handle_endtag(self, tag):
     global SlClassCSS
     global CodeMode
     global IsScript
     global OutPutWiki
     global IsNote
     global OkTrim
     # SlClassCSS[0] holds the (tag, class) recorded by the matching start tag.
     if SameText(tag, 'p') and SameText(SlClassCSS[0][1], 'command'): 
          OutPutWiki.append('\n' + KEndCode + '\n')
          CodeMode = False
          OkTrim = False
     elif SameText(tag, 'p') and SameText(SlClassCSS[0][1], 'highlight'):
          # Close markers in reverse order of the start tag.
          OutPutWiki.append(KCodeStr + KItalic + KBold + KParagraph)
          OkTrim = False
     elif SameText(tag, 'p') and SameText(SlClassCSS[0][1], 'system'):
          OutPutWiki.append(KItalic + KParagraph)
          OkTrim = False
     elif SameText(tag, 'p'):
          OutPutWiki.append(KParagraph + '\n')
     elif SameText(tag, 'div') and SameText(SlClassCSS[0][1], 'sponsor'):
          OutPutWiki.append('\n' + KEndNote + '\n')
          IsNote = False
     elif SameText(tag, 'h1'):
          OutPutWiki.append(KEndH1 +'\n')
     elif SameText(tag, 'h2'):
          OutPutWiki.append(KEndH2 +'\n')
     elif SameText(tag, 'h3'):
          OutPutWiki.append(KEndH3 +'\n')
     elif SameText(tag, 'h4'):
          # Counterpart of the start-tag special case for note boxes.
          if IsNote:
             OutPutWiki.append(KBold + '\n' + '----' + '\n')
          else:
             OutPutWiki.append(KEndH4 +'\n')
     elif SameText(tag, 'h5'):
          OutPutWiki.append(KEndH5 +'\n')
     elif SameText(tag, 'span') and SameText(SlClassCSS[0][1], 'system'):
          OutPutWiki.append(KCodeStr + KItalic)
     elif SameText(tag, 'span') and SameText(SlClassCSS[0][1], 'highlight'):
          OutPutWiki.append(KCodeStr + KItalic + KBold)
     elif SameText(tag, 'li'):
          OutPutWiki.append('\n')
     elif SameText(tag, 'b') or SameText(tag, 'strong'):
          OutPutWiki.append(KBold + ' ')
     elif SameText(tag, 'i') or SameText(tag, 'em'):
          OutPutWiki.append(KItalic)
     elif SameText(tag, 'pre'):
          OutPutWiki.append('\n' + KEndFile + '\n')
          CodeMode = False
     elif SameText(tag, 'script'):
          IsScript = False
     elif SameText(tag, 'noscript'):
          IsScript = False
     elif SameText(tag, 'a') and not SameText(SlClassCSS[0][1], 'thickbox'):
          OutPutWiki.append(KEndLink + ' ')
     # Pop the matching entry from the open-tag stack.
     if (not SameText(tag, 'br')) and (not SameText(tag, 'img')):
         if SameText(tag, SlClassCSS[0][0]):
            del SlClassCSS[0]
  def handle_entityref(self, name):
      # &gt;/&lt; are emitted literally; every other named entity is decoded
      # to its character (UTF-8 encoded, Python 2 unichr).
      if name == 'gt':
         OutPutWiki.append('>')
      elif name == 'lt':
         OutPutWiki.append('<')
      else:
         OutPutWiki.append(unichr(name2codepoint[name]).encode('utf8'))
  def handle_data(self, data):
      global CodeMode
      global IsScript
      global OkTrim
      # Script content is dropped; code/command text is whitespace-trimmed;
      # everything else passes through unchanged.
      if (not IsScript): 
         if (CodeMode) or (OkTrim): 
            OutPutWiki.append(data.strip())
         else: 
            OutPutWiki.append(data)

def getbaseurl(aUrl):
    """Strip a trailing '-pN' page suffix from an article URL.

    Fix: the original used a bare 'except:' to handle the no-match case
    (re.search returning None raised AttributeError on .start); an
    explicit None check replaces the exception-driven control flow.
    """
    match = re.search(r"(-p[0-9])", aUrl)
    if match:
        return aUrl[:match.start(1)]
    return aUrl

def getutlforpage(aBaseUrl, aPageNo):
    """URL of page aPageNo of the article (page 1 carries no '-pN' suffix)."""
    base = getbaseurl(aBaseUrl)
    if aPageNo == 1:
        return base
    return base + "-p" + str(aPageNo)

def getCredits(aCreditsTable):
    """Build the DokuWiki 'Revisioni' section: credits table plus the
    list of converted pages (from the module-level PagesList)."""
    pieces = [KBeginH2 + KCreditsHeader + KEndH2 + "\n\n"]
    pieces.append("^  " + KCreditsHeader +"  ^^ \n" + aCreditsTable + "^  Pagine  ^^ \n")
    for Page in PagesList:
        pieces.append("| [[" + Page + "]] || \n")
    return ''.join(pieces)

def ConvertPage(aUrl, aPassNo):
  """Download one article page, harvest credits, and feed it to the parser.

  aPassNo is 1 for the first page (title kept) and >1 for follow-up
  pages.  Output accumulates in the module-level OutPutWiki list; the
  credits table is rebuilt into CreditsInfo.
  """

  global CreditsInfo
  print "Converting page: ", aUrl
# Track Pages
  PagesList.append(aUrl)
  # Reset Vars
  ResetAllVar()
  # Download HTML
  usock = urllib2.urlopen(aUrl)
  data = usock.read()
  usock.close()
# Before all the credits data is stripped from the html, extract it ...
  if CreditsInfo == []:
     CreditsParser = MyCreditsParser()
     CreditsParser.feed(data)
# Build the Revisions / Credits table
  CreditsInfo = []
  CreditsInfo.append(getCredits(CreditsString))
  # Clean HTML
  MyHtml = CleanHTML(data, aPassNo)
  #SaveToFile(MyHtml, MyOptions.wikiNS, MyOptions.outfolder, aUrl + "_" + str(aPassNo))
  # Parse HTML
  parser = MyHTMLParser()
  parser.feed(MyHtml)

def ConvertSinglePage(aUrl):
    """Convert a one-page article and save the result.

    BUG FIX: CustomAdjustments returns a *string*, so the original's
    OutPutWiki.append(...) calls raised AttributeError whenever the
    credits table was enabled (which is the default).  The credits are
    now concatenated, exactly as ConvertMultiPage already did.
    """
    global OutPutWiki
    ConvertPage(aUrl, 1)
    # "Special" conversions -- turns the fragment list into one string.
    OutPutWiki = CustomAdjustments(OutPutWiki)
    # Append the Credits table at the bottom of the article.
    if MyOptions.creditstable:
        OutPutWiki = OutPutWiki + '\n'
        OutPutWiki = OutPutWiki + ''.join(CreditsInfo)
    # Save to File
    SaveToFile(OutPutWiki, MyOptions.wikiNS, MyOptions.outfolder, aUrl)

def ConvertMultiPage(aUrl, aBeginPage, aEndPage):
    """Convert pages aBeginPage..aEndPage into a single saved article.

    NOTE: like the original, the page URLs are derived from
    MyOptions.url, not from the aUrl parameter.
    """
    global OutPutWiki
    for page_no in range(aBeginPage, aEndPage + 1):
        ConvertPage(getutlforpage(MyOptions.url, page_no), page_no)
    # "Special" conversions -- turns the fragment list into one string.
    OutPutWiki = CustomAdjustments(OutPutWiki)
    # Append the Credits table at the bottom of the article.
    if MyOptions.creditstable:
        OutPutWiki = OutPutWiki + '\n'
        OutPutWiki = OutPutWiki + ''.join(CreditsInfo)
    # Save to File
    SaveToFile(OutPutWiki, MyOptions.wikiNS, MyOptions.outfolder, getutlforpage(MyOptions.url, 1))

if name == “main”:

  MyOptions = Parse_Params()

  if (MyOptions.wikiNS != None):
    MyOptions.wikiNS = MyOptions.wikiNS.replace('.', '_')
  if (MyOptions.mediaNS != None):
    MyOptions.mediaNS = MyOptions.mediaNS.replace('.', '_')
  print os.path.basename(sys.argv[0]) + " By " + base64.b64decode(KAuthor + '==') + " & " + base64.b64decode(KCoAuthor)
  print "Begin Convert"
  if (MyOptions.frompage != None):
     BeginPage = int(MyOptions.frompage)
  if (MyOptions.topage != None):
     EndPage = int(MyOptions.topage)
  if (BeginPage != -1) and (EndPage == -1):
     EndPage = BeginPage
  if (BeginPage == -1) and (EndPage != -1):
     BeginPage = EndPage
   
  if (BeginPage == -1) and (EndPage == -1):
     if int(MyOptions.pagecount) == 0:
        ConvertSinglePage(MyOptions.url)
     else:
        BeginPage = 1
        EndPage   = int(MyOptions.pagecount)
   
  if (BeginPage == EndPage):
     ConvertSinglePage(MyOptions.url)
  else:
     ConvertMultiPage(MyOptions.url, BeginPage, EndPage)
  print "Convert done"

</code>

Per testare il software mi sono fatto un piccolo batch

rem HTF2Wiki.py -u http://www.howtoforge.com/perfect-server-ubuntu-12.10-apache2-bind-dovecot-ispconfig-3-p2 -p7 -o .\Test -m linux:perfect_server_ubuntu_12.10
HTF2Wiki.py -u http://www.howtoforge.com/perfect-server-ubuntu-12.10-apache2-bind-dovecot-ispconfig-3-p2 -f1 -t7 -o .\Test -m linux:perfect_server_ubuntu_12.10

Le 2 righe sono equivalenti: partono dalla prima pagina dell'articolo che si trova a quell'indirizzo e scaricano tutte e 7 le pagine dell'articolo in un unico file .txt, scaricando anche le immagini.

Allego anche il sorgente

Download Con ProgressBar

Un esempio di download con la progressbar

# Download a file while showing a simple in-place text progress indicator
# (Python 2, urllib2).
import urllib2
 
# Publicly available file used purely as a download test fixture.
url = "http://download.thinkbroadband.com/10MB.zip"
 
file_name = url.split('/')[-1]
u = urllib2.urlopen(url)
f = open(file_name, 'wb')
meta = u.info()
# Total size as announced by the Content-Length response header.
file_size = int(meta.getheaders("Content-Length")[0])
print "Downloading: %s Bytes: %s" % (file_name, file_size)
 
file_size_dl = 0
block_sz = 8192
# Read and write the payload in 8 KiB chunks until EOF.
while True:
    buffer = u.read(block_sz)
    if not buffer:
        break
 
    file_size_dl += len(buffer)
    f.write(buffer)
    # Progress line: bytes downloaded so far and percentage of the total.
    status = r"%10d  [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
    # Append backspace characters (chr(8)) so the next print overwrites
    # this line in place; the trailing comma suppresses the newline.
    status = status + chr(8)*(len(status)+1)
    print status,
 
f.close()