Hi,
I would suggest a more specific way of extracting data for each filetype, something more like this:
Code:
/// <summary>
/// Demonstrates converting a single source file to a one-page PDF: the text is
/// extracted by a routine chosen from the file's extension, then drawn onto a
/// PDFsharp page and saved to disk.
/// </summary>
/// <remarks>
/// The source and destination paths are hard-coded sample values. A file whose
/// extension matches none of the handled types leaves the text empty, so a blank
/// page is written.
/// </remarks>
private void Test()
{
    string sourceFileName = @"C:\My_Files\Test.txt";
    //string sourceFileName = @"C:\My_Files\Test.doc";
    //string sourceFileName = @"C:\My_Files\Test.xlsx";

    // Compare the actual extension rather than searching the whole path for a
    // substring: Contains(".doc") would also match ".docx" and any directory
    // name that happens to contain ".doc".
    string extension = Path.GetExtension(sourceFileName);
    const System.StringComparison ignoreCase = System.StringComparison.OrdinalIgnoreCase;

    string formattedText = "";
    if (string.Equals(extension, ".txt", ignoreCase))
    {
        // Read the whole file in one call. This avoids decoding the unread tail
        // of a hand-managed byte buffer and handles a UTF-8 byte order mark.
        string rawText = File.ReadAllText(sourceFileName, Encoding.UTF8);

        // Line-ending conventions differ between sources (\r, \n or \r\n), so
        // FormatTXT is the place to normalise them for the PDF output.
        formattedText = FormatTXT(rawText);
    }
    else if (string.Equals(extension, ".doc", ignoreCase))
    {
        // Read in your .doc file to a string and format it.
        formattedText = ExtractDOC(sourceFileName);
    }
    else if (string.Equals(extension, ".xlsx", ignoreCase))
    {
        // Read in your .xlsx file to a string and format it.
        formattedText = ExtractXLSX(sourceFileName);
    }

    PdfDocument document = new PdfDocument();
    document.Info.Title = "Created with PDFsharp";
    PdfPage page = document.AddPage();
    XGraphics gfx = XGraphics.FromPdfPage(page);
    XFont font = new XFont("Verdana", 10, XFontStyle.Regular);

    // Draw the extracted text anchored at the top-left of a rectangle covering the page.
    gfx.DrawString(formattedText, font, XBrushes.Black,
        new XRect(0, 0, page.Width, page.Height),
        XStringFormats.TopLeft);

    string destinationFileName = @"C:\My_Files\Test.pdf";
    document.Save(destinationFileName);
    document.Close();
}
/// <summary>
/// Prepares text read from a .txt file before it is drawn into the PDF.
/// </summary>
/// <param name="input">Raw text content of the file.</param>
/// <returns>
/// The text to draw. Currently the input is returned unchanged, with
/// <c>null</c> mapped to an empty string.
/// </returns>
private string FormatTXT(string input)
{
    // TODO: placeholder - add the real formatting here (for example, line-ending
    // normalisation). Until then the text is passed through as-is.
    return input ?? "";
}
/// <summary>
/// Extracts the text content of a .doc file.
/// </summary>
/// <param name="input">Path of the .doc file to read.</param>
/// <returns>The extracted text.</returns>
/// <exception cref="System.NotImplementedException">
/// Always thrown: extraction is a stub that still has to be written.
/// </exception>
private string ExtractDOC(string input)
{
    // Fail loudly rather than return placeholder text that would silently
    // produce a blank or misleading PDF.
    throw new System.NotImplementedException(".doc text extraction has not been implemented yet.");
}
/// <summary>
/// Extracts the text content of an .xlsx file.
/// </summary>
/// <param name="input">Path of the .xlsx file to read.</param>
/// <returns>The extracted text.</returns>
/// <exception cref="System.NotImplementedException">
/// Always thrown: extraction is a stub that still has to be written.
/// </exception>
private string ExtractXLSX(string input)
{
    // Fail loudly rather than return placeholder text that would silently
    // produce a blank or misleading PDF.
    throw new System.NotImplementedException(".xlsx text extraction has not been implemented yet.");
}
That code probably isn't 100% correct, as I've not tested it and I'm slightly unfamiliar with the way you're reading the text from the file, but hopefully you get the idea.
I read text in like this:
Code:
/// <summary>
/// Reads the entire contents of a text file into a string.
/// </summary>
/// <param name="filePath">Path of the file to read.</param>
/// <returns>All text in the file.</returns>
public static string LoadFile(string filePath)
{
    // The using statement disposes (and therefore closes) the reader even when
    // ReadToEnd throws, so the file handle is never leaked.
    using (TextReader tr = new StreamReader(filePath))
    {
        return tr.ReadToEnd();
    }
}
I think that, as the formats are different, you will have to extract the data from them in different ways. .txt shouldn't be too much of an issue. For .doc and .xls, look at the Office 2003 format; for .docx and .xlsx, look at the Office 2007 format and Open XML (the format used to save Office 2007+ files). Open XML has its own set of objects for interpreting Office 2007 files (and also OpenOffice.org, I believe) - look at
http://openxmldeveloper.org for more info.
Because different file formats store data in different ways, I think you should limit the number of possible upload formats, as you will have to address each one separately...
Hope this helps,
Mike